In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn import tree # for tree.plot_tree()
from sklearn.tree import export_text # for export_text()

Plot entropy for coin flip (Bernoulli trial) with probability p = P(heads) = P(success).¶

In [2]:
X = np.array([0, 1]) # We don't care about these values--only their number.
xplot = np.linspace(0, 1)
H_X = np.zeros(len(xplot))
for i in range(len(xplot)):
    p = xplot[i]
    P = np.array([p, 1 - p])
    if 0 < p < 1:
        H_X[i] = -np.sum(P * np.log2(P))
    else:
        H_X[i] = 0

plt.plot(xplot, H_X)
plt.title('Entropy of coin flip (Bernoulli trial) is\n' +
          'average information in bits given by outcome.')
plt.xlabel('$P(X = 1)$ (heads)')
_ = plt.ylabel('Entropy $H(X)$')
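
As a quick numeric check (a sketch, not part of the original notebook): the curve peaks at 1 bit for a fair coin and falls toward 0 as the outcome becomes more predictable.

# Sanity check of a few points on the entropy curve (illustrative only).
for p in [0.5, 0.9, 0.99]:
    P = np.array([p, 1 - p])
    print(f'p={p}: H = {-np.sum(P * np.log2(P)):.3f} bits')  # about 1.000, 0.469, 0.081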

Small example to start by hand and finish with scikit-learn:¶

In [3]:
df_all = pd.read_csv('http://www.stat.wisc.edu/~jgillett/451/data/mtcars.csv', index_col=0)
df = df_all.head(n=8)
df = df[['mpg', 'cyl', 'vs']]
df
Out[3]:
                    mpg  cyl  vs
Mazda RX4          21.0    6   0
Mazda RX4 Wag      21.0    6   0
Datsun 710         22.8    4   1
Hornet 4 Drive     21.4    6   1
Hornet Sportabout  18.7    8   0
Valiant            18.1    6   1
Duster 360         14.3    8   0
Merc 240D          24.4    4   1
In [4]:
feature_names = ['mpg', 'cyl']
X = df[feature_names]
y = df.vs
print(f"Sorted by mpg:\n{df.sort_values('mpg')}\n")
print(f"Sorted by cyl:\n{df.sort_values('cyl')}\n")

clf = DecisionTreeClassifier(criterion='entropy', max_depth=None, random_state=0)
clf.fit(X, y)
plt.rcParams["figure.figsize"] = (8, 7) # (width, height) https://matplotlib.org/stable/api/figure_api.html
tree.plot_tree(clf, feature_names=feature_names, filled=True)
print(export_text(clf, feature_names=feature_names))
print(f'Accuracy on training data is clf.score(X, y)={clf.score(X, y)}.')
Sorted by mpg:
                    mpg  cyl  vs
Duster 360         14.3    8   0
Valiant            18.1    6   1
Hornet Sportabout  18.7    8   0
Mazda RX4          21.0    6   0
Mazda RX4 Wag      21.0    6   0
Hornet 4 Drive     21.4    6   1
Datsun 710         22.8    4   1
Merc 240D          24.4    4   1

Sorted by cyl:
                    mpg  cyl  vs
Datsun 710         22.8    4   1
Merc 240D          24.4    4   1
Mazda RX4          21.0    6   0
Mazda RX4 Wag      21.0    6   0
Hornet 4 Drive     21.4    6   1
Valiant            18.1    6   1
Hornet Sportabout  18.7    8   0
Duster 360         14.3    8   0

|--- mpg <= 21.20
|   |--- mpg <= 18.40
|   |   |--- mpg <= 16.20
|   |   |   |--- class: 0
|   |   |--- mpg >  16.20
|   |   |   |--- class: 1
|   |--- mpg >  18.40
|   |   |--- class: 0
|--- mpg >  21.20
|   |--- class: 1

Accuracy on training data is clf.score(X, y)=1.0.
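
The fitted classifier can also predict new cars. As a small illustration (this car is hypothetical, not a row of mtcars), mpg=20 falls into the mpg <= 21.2, mpg > 18.4 branch of the printed tree, which is a class-0 leaf:

# Hypothetical car (not from mtcars), just to illustrate prediction with the fitted tree.
new_car = pd.DataFrame({'mpg': [20.0], 'cyl': [6]})
print(clf.predict(new_car))  # the printed tree routes mpg=20 (<= 21.2, > 18.4) to class 0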

Students: You do not have to learn the following code¶

that implements the ID3 algorithm, but I think it may be helpful in understanding how ID3 works.

Define helper functions:¶

  • f_ID3(y) to give the average y value over the examples in S
  • I(p) to give information content corresponding to outcome probability p
  • H(S) to give entropy of S
  • H_weighted(S_minus, S_plus) to give weighted average entropy of the split
  • cost(y) is just a wrapper of H(S) for classification (see below for regression)
  • cost_of_split() is just a wrapper of H_weighted() for classification (see below for regression); a quick sanity check of these helpers follows the code cell below
In [5]:
def f_ID3(y): # y is a vector of 0s and 1s; returns average y (for classification, it is also the proportion of 1s)
    return (1 / y.size) * np.sum(y)

def I(p): # p is the probability of an outcome
    return -np.log2(p)

def H(y): # y is a vector of 0s and 1s
    p = f_ID3(y)
    if 0 < p < 1:
        return p * I(p) + (1 - p) * I(1 - p)
    else:
        return 0.0 # otherwise format specifier ":.3" used elsewhere complains about integer 0

def H_weighted(y_minus, y_plus): # y_minus and y_plus are each vectors of 0s and 1s
    n_minus = y_minus.size
    n_plus = y_plus.size
    n = n_minus + n_plus
    return (n_minus / n) * H(y_minus) + (n_plus / n) * H(y_plus)

def cost(y):
    return H(y)

def cost_of_split(y_minus, y_plus):
    return H_weighted(y_minus, y_plus)
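
As a quick sanity check of these helpers (a sketch; the label vectors below are read off the 8-row table above): the full set has four 0s and four 1s, so its entropy is 1 bit, while splitting at mpg <= 21.2 leaves one 1 among five labels on one side and three 1s on the other.

# Sanity check using labels read from the 8-row example above.
y_all = np.array([0, 0, 1, 1, 0, 1, 0, 1])   # vs column: four 0s and four 1s, so p = 0.5
print(f'H(y_all) = {H(y_all):.3}')           # expect 1.0 bit
y_minus = np.array([0, 0, 0, 1, 0])          # cars with mpg <= 21.2
y_plus = np.array([1, 1, 1])                 # cars with mpg > 21.2
print(f'H_weighted = {H_weighted(y_minus, y_plus):.3}')  # about 0.451, the root split cost found below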

Define function split(X, y, feature_names)¶

that gives the best feature to split on, the best threshold, and the minimum cost of that split.

In [6]:
def split(X, y, feature_names, debug=False):
    min_cost = np.inf
    best_j = np.nan
    best_t = np.nan
    if np.isclose(cost(y), 0): # instead of 'cost(y) == 0'
        return (best_j, best_t, min_cost)
    n_features = X.shape[1]
    assert n_features == len(feature_names)
    for j in range(n_features):
        feature = X.iloc[:, j]
        unique = np.unique(feature)
        if unique.size == 1: # cannot split on this feature
            continue
        thresholds = (unique[1:] + unique[:-1]) / 2 # averages each adjacent pair of unique values
        if debug:
            print(f'feature_names[{j}]={feature_names[j]}, unique={unique}, thresholds={thresholds}')
        for t in thresholds:
            S_minus = y[feature <=  t]
            S_plus  = y[feature > t]
            current_cost = cost_of_split(S_minus.to_numpy(), S_plus.to_numpy())
            if debug:
                print(f'    t={t:.3}, S_minus={S_minus}, S_plus={S_plus}, cost_minus={cost(S_minus):.3}, cost_plus={cost(S_plus):.3}, cost_split={current_cost:.3}')
            if current_cost < min_cost:
                min_cost = current_cost
                best_j = j
                best_t = t
    return (best_j, best_t, min_cost)
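
A usage sketch (using the classification X, y, and feature_names defined above): a single call should recover the root split that scikit-learn chose; passing debug=True additionally prints every candidate threshold considered.

# Usage sketch: find the best single split of the 8-row classification example.
best_j, best_t, min_cost = split(X, y, feature_names)
print(f'best feature: {feature_names[best_j]}, threshold: {best_t:.3}, cost: {min_cost:.3}')
# Expect mpg at 21.2 with weighted entropy about 0.451, matching the root of the tree above.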

Define recursive function decision_tree(X, y, feature_names, depth=0, debug=False)¶

that simply returns on a zero-cost set S = (X, y) of examples; otherwise it calls split() to get the best split and then calls decision_tree() recursively on each of the left and right subsets.

In [7]:
def decision_tree(X, y, feature_names, depth=0, debug=False):
    # depth is of recursion for indenting output
    padding = ' ' * depth * 2 # indent output by depth * 2 spaces
    if debug:
        print(f'\n{padding}decision_tree() called with X and y: ------------------------------------------')
        display(pd.concat((X, y), axis=1))
    best_j, best_t, min_cost = split(X, y, feature_names, debug)
    if np.isnan(best_j): # split() did not find a split
        return
    print(f'{padding}best_j={best_j}, best_feature={feature_names[best_j]}, best_t={best_t:.3}, min_cost={min_cost:.3}')
    le = (X.iloc[:, best_j] <=  best_t) # 'le' refers to 'less than or equal to'
    decision_tree(X[ le], y[ le], feature_names, depth+1, debug=False) # left branch
    decision_tree(X[~le], y[~le], feature_names, depth+1, debug=False) # right branch
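
Note that the recursive calls above hard-code debug=False, so passing debug=True traces only the top-level split search; a usage sketch:

# Usage sketch: trace the top-level split search (the recursive calls keep debug=False).
decision_tree(X, y, feature_names, debug=True)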

Make the classification tree we started by hand earlier:¶

In [8]:
decision_tree(X, y, feature_names=feature_names)
best_j=0, best_feature=mpg, best_t=21.2, min_cost=0.451
  best_j=0, best_feature=mpg, best_t=18.4, min_cost=0.4
    best_j=0, best_feature=mpg, best_t=16.2, min_cost=0.0

Notice that these three splits are the same as those found by scikit-learn, above.

Make a regression tree for y='mpg' vs x='wt' for the first 8 rows of 'mtcars':¶

  • cost(y) is a wrapper of mean_square() for regression (see above for classification)
  • cost_of_split() gives the sum of squared errors associated with the split for regression (see above for classification); a quick numeric check follows the output below
In [9]:
def mean_square(y):
    return np.sum((y - np.mean(y))**2) / y.size

def cost(y):
    assert y.size > 0
    return mean_square(y)

def cost_of_split(y_minus, y_plus):
    return cost(y_minus) * y_minus.size + cost(y_plus) * y_plus.size

df_all = pd.read_csv('http://www.stat.wisc.edu/~jgillett/451/data/mtcars.csv', index_col=0)
df = df_all.head(n=8)
feature_names = ['wt']
X = df[feature_names]
y = df.mpg

decision_tree(X, y, feature_names, debug=False)
best_j=0, best_feature=wt, best_t=3.33, min_cost=20.1
  best_j=0, best_feature=wt, best_t=3.03, min_cost=6.66
    best_j=0, best_feature=wt, best_t=2.47, min_cost=0.0
    best_j=0, best_feature=wt, best_t=3.2, min_cost=0.0
  best_j=0, best_feature=wt, best_t=3.51, min_cost=0.18
    best_j=0, best_feature=wt, best_t=3.45, min_cost=0.0
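
As a quick check of the regression costs (made-up numbers, not mtcars rows): a branch with y = [1, 3] has mean 2 and mean squared error 1, a pure branch with y = [10] has error 0, so the cost of that split is the total squared error 2.

# Made-up numbers (not from mtcars), just to check the regression cost functions.
y_minus, y_plus = np.array([1.0, 3.0]), np.array([10.0])
print(mean_square(y_minus))            # ((1-2)^2 + (3-2)^2) / 2 = 1.0
print(cost_of_split(y_minus, y_plus))  # total squared error: 1.0*2 + 0.0*1 = 2.0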

Check our regression tree with DecisionTreeRegressor():¶

In [10]:
model = DecisionTreeRegressor(criterion='squared_error', max_depth=None)
model.fit(X, y)
fig = plt.figure(figsize=(15, 15)) # (width, height) in inches
_ = tree.plot_tree(model, feature_names=['wt'])

Plot the regression function for y='mpg' vs x='wt' from all 32 rows of 'mtcars':¶

(Compare to the earlier linear regression model.)

In [11]:
df = pd.read_csv('http://www.stat.wisc.edu/~jgillett/451/data/mtcars.csv', index_col=0)
X = df[['wt']]
y = df['mpg']
model = DecisionTreeRegressor(criterion='squared_error', max_depth=2) # try max_depth=None
model.fit(X, y)
y_hat = model.predict(X)
x = df.wt
plt.plot(x, y, '.', color='black', label='data')
plt.title('mtcars')
plt.xlabel('weight')
plt.ylabel('mpg')
plt.xlim(0, 6)
plt.ylim(0, 40)
x_plot = np.linspace(0, 6, num=100)
y_hat_plot = model.predict(pd.DataFrame({'wt': x_plot}))
plt.plot(x_plot, y_hat_plot, color='black', label=r'$\hat{y} = \bar{y}_S$ from tree')
# add vertical lines from (x, y) to (x, y_hat) to show errors:
plt.plot([x, x], [y, y_hat], # [x1, x2], [y1, y2]
         color='black', linewidth=.5, label=None) # label=None prevents duplicate legend entries
plt.plot([], [], color='black', linewidth=.5, label='errors') # add one legend entry
plt.plot(x, y_hat, '.', color='red', label='fitted values')
plt.legend()
print(f'model.score(X, y)={model.score(X, y):.3}')
model.score(X, y)=0.831
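
For a regressor, score() reports R-squared. As a quick check (a sketch recomputing it from the fitted values above, using R^2 = 1 - SS_res / SS_tot):

# Recompute R^2 by hand; it should match model.score(X, y) above.
ss_res = np.sum((y - y_hat)**2)
ss_tot = np.sum((y - np.mean(y))**2)
print(f'R^2 = {1 - ss_res / ss_tot:.3}')  # about 0.831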