In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import (confusion_matrix, precision_score, recall_score,
                             accuracy_score, roc_auc_score, roc_curve, RocCurveDisplay)
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree # for tree.plot_tree()
from sklearn import linear_model
from sklearn.model_selection import (train_test_split, cross_val_score, 
                                     GridSearchCV, RandomizedSearchCV)

from scipy.stats import uniform

Model Performance Assessment, Hyperparameter Tuning, Cross-Validation¶

Model Performance Assessment¶

Confusion Matrix¶

In [2]:
# make fake data:       4 TN,   2 FP,   1 FN,      3 TP
y     = np.array([0, 0, 0, 0,   0, 0,      1,   1, 1, 1])
y_hat = np.array([0, 0, 0, 0,   1, 1,      0,   1, 1, 1])
M = confusion_matrix(y_true=y, y_pred=y_hat)
# to include row and column labels, put matrix in DataFrame
df = pd.DataFrame(data=M, index=['actual 0', 'actual 1'], columns=['predict 0', 'predict 1']) # or pd.DataFrame(data=M, index=['0', '1'], columns=['0', '1'])
print(f'confusion matrix:\n{df}')
confusion matrix:
          predict 0  predict 1
actual 0          4          2
actual 1          1          3
In [3]:
# peel off component counts from confusion matrix
TN, FP, FN, TP = M.ravel() # .ravel() returns a vector from a matrix
print(f'TN={TN}, FP={FP}, FN={FN}, TP={TP}')
TN=4, FP=2, FN=1, TP=3

Precision, Recall, Accuracy¶

In [4]:
precision = precision_score(y_true=y, y_pred=y_hat)
recall = recall_score(y_true=y, y_pred=y_hat)
accuracy = accuracy_score(y_true=y, y_pred=y_hat)
print(f'precision={precision}, recall={recall}, accuracy={accuracy}')
precision=0.6, recall=0.75, accuracy=0.7
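
As a check, here is a minimal sketch recomputing the same three metrics by hand from the TN, FP, FN, TP counts peeled off above (the _by_hand names are just illustrative):

In [ ]:
# recompute the metrics from the raw counts; they should match sklearn's values above
precision_by_hand = TP / (TP + FP)                   # 3 / 5  = 0.6
recall_by_hand    = TP / (TP + FN)                   # 3 / 4  = 0.75
accuracy_by_hand  = (TP + TN) / (TN + FP + FN + TP)  # 7 / 10 = 0.7
print(f'precision={precision_by_hand}, recall={recall_by_hand}, accuracy={accuracy_by_hand}')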

Area under ROC Curve (AUC)¶

Preface: True Positive Rate, False Positive Rate (on data above)¶

In [5]:
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
print(f'TPR={TPR:.3}, FPR={FPR:.3}')
TPR=0.75, FPR=0.333
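
A classifier evaluated at one threshold gives one (FPR, TPR) pair, which is one point in ROC space. Here is a minimal sketch plotting that single point for the fake data above, next to the diagonal "merely guessing" line:

In [ ]:
# one classifier (at one threshold) = one point in ROC space
plt.plot([0, 1], [0, 1], ':k', label='merely guessing') # add diagonal line
plt.plot(FPR, TPR, 'or', label='classifier on fake data')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
_ = plt.legend()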

Now we make an ROC curve.

In [6]:
df = pd.read_csv('http://www.stat.wisc.edu/~jgillett/451/data/mtcars.csv', index_col=0)
feature_names = ['mpg', 'cyl', 'wt']
X = df[feature_names]
y = df['am']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
In [7]:
# train decision tree on training data; make ROC curve on test data
clf = DecisionTreeClassifier(criterion='entropy', max_depth=1, random_state=0)
clf.fit(X_train, y_train)
RocCurveDisplay.from_estimator(clf, X_test, y_test)
plt.plot([0, 1], [0, 1], ':k', label='merely guessing') # add diagonal line
plt.title('ROC curve for classifying transmissions (am) with depth-1 tree')
plt.legend()
print(f'Accuracy on test data is {clf.score(X_test, y_test):.3}.')
print(f'Or, via accuracy_score(), it is also {accuracy_score(y_test, clf.predict(X_test)):.3}.')
Accuracy on test data is 0.875.
Or, via accuracy_score(), it is also 0.875.
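It may also help to see the test-set confusion matrix behind that 0.875. Here is a minimal sketch reusing the confusion_matrix() and DataFrame-labeling idea from above (the counts depend on the train/test split):

In [ ]:
# confusion matrix on the test data (clf.predict() uses the default 0.5 probability threshold)
M_test = confusion_matrix(y_true=y_test, y_pred=clf.predict(X_test))
print(pd.DataFrame(data=M_test, index=['actual 0', 'actual 1'], columns=['predict 0', 'predict 1']))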
In [8]:
# calculate AUC without the help of RocCurveDisplay.from_estimator()
probability_y_is_1 = clf.predict_proba(X_test)[:, 1]
accuracy = clf.score(X_test, y_test)
auc = roc_auc_score(y_true=y_test, y_score=probability_y_is_1)
print(f'Area under ROC curve on test data is {auc:.3}.')
Area under ROC curve on test data is 0.833.
In [9]:
# look at the tree: where did the ROC curve's corner come from?
_ = tree.plot_tree(clf, feature_names=feature_names, class_names=['automatic', 'manual'])
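
A depth-1 tree has only two leaves, so predict_proba() can return only two distinct probabilities: the class-1 proportions in those leaves. A minimal sketch to confirm this on the training data (these proportions should be the 1/14 and 9/10 referenced below):

In [ ]:
# the two leaf proportions; they reappear below as the interesting roc_curve() thresholds
print(np.unique(clf.predict_proba(X_train)[:, 1]))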

Now, to understand the curve, repeat the last plot and show that its points came from roc_curve():¶

In [10]:
FPR, TPR, thresholds = roc_curve(y_true=y_test, y_score=probability_y_is_1)
print(f'For reference, 9/10={9/10:.3} and 1/14={1/14:.3}')
with np.printoptions(precision=3): # set precision for this block only
    print(f'thresholds={thresholds},\nFPR={FPR},\nTPR={TPR}')

RocCurveDisplay.from_estimator(clf, X_test, y_test)
plt.plot(FPR, TPR, 'or', label='from roc_curve()')
_ = plt.legend()
print('y_test:')
print(y_test.to_numpy())
for i in range(len(thresholds)):
    print(f'y_hat for threshold {thresholds[i]:.3}:')
    print((clf.predict_proba(X_test)[:, 1] >= thresholds[i]) + 0) # '+ 0' to convert to int
For reference, 9/10=0.9 and 1/14=0.0714
thresholds=[  inf 0.9   0.071],
FPR=[0. 0. 1.],
TPR=[0.    0.667 1.   ]
y_test:
[0 0 0 1 0 0 1 1]
y_hat for threshold inf:
[0 0 0 0 0 0 0 0]
y_hat for threshold 0.9:
[0 0 0 1 0 0 0 1]
y_hat for threshold 0.0714:
[1 1 1 1 1 1 1 1]
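
Since the ROC curve just connects these (FPR, TPR) points with line segments, the AUC can be checked by trapezoidal integration. A minimal sketch with numpy; it should reproduce the 0.833 reported above:

In [ ]:
# AUC = area under the piecewise-linear curve through the (FPR, TPR) points
print(f'AUC by trapezoidal rule: {np.trapz(TPR, FPR):.3}')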

Here is an example of a more typical ROC curve with many points.¶

It uses the NFL field goal logistic regression model I presented in the logistic regression lecture.

In [11]:
df = pd.read_csv('http://www.stat.wisc.edu/~jgillett/451/data/field_goals.csv')
model = linear_model.LogisticRegression(C=1000)
X = df[['kick_distance']]
y = (df.field_goal_result == 'made')
model.fit(X, y)

RocCurveDisplay.from_estimator(model, X, y)
plt.plot([0, 1], [0, 1], ':k', label='merely guessing') # add diagonal line
plt.title('ROC curve for NFL field goals with logistic regression')
_ = plt.legend()
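
The same roc_auc_score() call as before works here, too. A minimal sketch reading the AUC off this model's predicted make probabilities (no train/test split, matching the cell above; p_made is just an illustrative name):

In [ ]:
# AUC for the field-goal model from its predicted probabilities of 'made'
p_made = model.predict_proba(X)[:, 1]
print(f'AUC: {roc_auc_score(y_true=y, y_score=p_made):.3}')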

Cross-Validation¶

In [12]:
print(f'Training accuracy is {clf.score(X_train, y_train):.3}.')
scores = cross_val_score(clf, X, y)
with np.printoptions(precision=2): # set precision for this block only
    print(f'Cross-validation scores={scores}, mean={np.mean(scores):.3}')
print(f'Test accuracy is {clf.score(X_test, y_test):.3}.')
Training accuracy is 0.917.
Cross-validation scores=[0.84 0.84 0.84 0.84 0.84], mean=0.842
Test accuracy is 0.875.

Notice that the training accuracy over-estimated the test accuracy, while the cross-validation accuracy under-estimated it (under-estimation is the direction of error I prefer, though I would rather have neither).
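
cross_val_score() defaults to 5 folds; the cv= argument changes the number of folds and scoring= selects the metric. A minimal sketch reusing clf, X, and y from the cell above:

In [ ]:
# 10-fold cross-validation, scored by accuracy (the default for classifiers)
scores_10 = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
with np.printoptions(precision=2): # set precision for this block only
    print(f'10-fold scores={scores_10}, mean={np.mean(scores_10):.3}')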

Hyperparameter Tuning¶

In [13]:
# reuse concentric circles data from SVM kernel trick demo; first display data
df = pd.read_csv('http://www.stat.wisc.edu/~jgillett/451/data/circles.csv')
X = df[['x0', 'x1']]
y = df.y
plt.plot(df.x0[y == 0], df.x1[y == 0], '.r', label='0')
_ = plt.plot(df.x0[y == 1], df.x1[y == 1], '.b', label='1')

Grid search¶

In [14]:
# do grid search to find which kernel/C combination works best
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 1000]}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(X, y)
print(f'clf.best_score_={clf.best_score_:.3}, ' +
      f'clf.best_params_={clf.best_params_}')
clf.best_score_=1.0, clf.best_params_={'C': 1, 'kernel': 'rbf'}
In [15]:
# inspect tuning results in more detail
print(pd.DataFrame(clf.cv_results_))
   mean_fit_time  std_fit_time  mean_score_time  std_score_time param_C  \
0       0.002874      0.000726         0.001935        0.000152       1   
1       0.002706      0.000196         0.001975        0.000197       1   
2       0.053674      0.015442         0.002583        0.000186    1000   
3       0.002262      0.000148         0.001723        0.000036    1000   

  param_kernel                           params  split0_test_score  \
0       linear     {'C': 1, 'kernel': 'linear'}                0.5   
1          rbf        {'C': 1, 'kernel': 'rbf'}                1.0   
2       linear  {'C': 1000, 'kernel': 'linear'}                0.5   
3          rbf     {'C': 1000, 'kernel': 'rbf'}                1.0   

   split1_test_score  split2_test_score  split3_test_score  split4_test_score  \
0                0.5              0.500                0.5                0.5   
1                1.0              1.000                1.0                1.0   
2                0.5              0.625                0.5                0.5   
3                1.0              1.000                1.0                1.0   

   mean_test_score  std_test_score  rank_test_score  
0            0.500            0.00                4  
1            1.000            0.00                1  
2            0.525            0.05                3  
3            1.000            0.00                1  
In [16]:
# repeat "inspect" line above, this time printing only most important columns
print(pd.DataFrame(clf.cv_results_).iloc[:, [6, 12]].
      sort_values(by='mean_test_score', axis=0, ascending=False))
                            params  mean_test_score
1        {'C': 1, 'kernel': 'rbf'}            1.000
3     {'C': 1000, 'kernel': 'rbf'}            1.000
2  {'C': 1000, 'kernel': 'linear'}            0.525
0     {'C': 1, 'kernel': 'linear'}            0.500

From the mean_test_score column, it looks like either C value worked perfectly for kernel='rbf'.
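
After fitting, GridSearchCV refits the winning combination on the whole data set (by default, refit=True) and exposes it as best_estimator_; predict() on the search object uses that refit model. A minimal sketch, with two made-up query points chosen only for illustration:

In [ ]:
# the refit best model, and its predictions for two arbitrary made-up points
print(clf.best_estimator_)
X_new = pd.DataFrame({'x0': [0.0, 2.0], 'x1': [0.0, 2.0]})
print(clf.predict(X_new))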

Random search¶

In [17]:
rng = np.random.default_rng(seed=0)
distributions = {
    'kernel': ('linear', 'rbf'),
    'C': uniform(loc=0, scale=1000) # uniform[loc, loc + scale]
}
clf = RandomizedSearchCV(svc, param_distributions=distributions, n_iter=10)
clf.fit(X, y)
print(f'clf.best_score_={clf.best_score_:.3}, ' +
      f'clf.best_params_={clf.best_params_}')
clf.best_score_=1.0, clf.best_params_={'C': 704.7216944773553, 'kernel': 'rbf'}
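
For intuition: each of the n_iter=10 candidates draws C from the continuous uniform distribution above (and picks a kernel at random) rather than walking a fixed grid. A minimal sketch drawing a few C values from that same distribution, reusing the rng created above (assuming this scipy version accepts a Generator as random_state):

In [ ]:
# a few example draws of C; RandomizedSearchCV samples n_iter such candidates
print(distributions['C'].rvs(size=5, random_state=rng))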

We just used tuning to find hyperparameters on training data where we knew what to expect (that the rbf kernel was necessary to separate the two circles of different colors). More typically, we use tuning to find the best hyperparameters on validation data and then evaluate that best combination on test data. You will practice this in homework, coming soon.
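
Here is a minimal sketch of that more typical workflow on the circles data: hold out a test set, let GridSearchCV's internal cross-validation act as the validation step on the training portion, and then score the refit best model once on the untouched test set (the name search is just illustrative):

In [ ]:
# tune on training data (internal CV plays the role of validation), then test once
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
search = GridSearchCV(svm.SVC(), {'kernel': ('linear', 'rbf'), 'C': [1, 1000]})
search.fit(X_train, y_train)
print(f'search.best_params_={search.best_params_}')
print(f'test accuracy of refit best model: {search.score(X_test, y_test):.3}')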