import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import (confusion_matrix, precision_score, recall_score,
                             accuracy_score, roc_auc_score, roc_curve, RocCurveDisplay)
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree # for tree.plot_tree()
from sklearn.model_selection import (train_test_split, cross_val_score,
                                     GridSearchCV, RandomizedSearchCV)
from sklearn.datasets import load_wine
from scipy.stats import uniform
# make fake data: 4 TN, 2 FP, 1 FN, 3 TP
y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1])
y_hat = np.array([0, 0, 0, 0, 1, 1, 0, 1, 1, 1])
M = confusion_matrix(y_true=y, y_pred=y_hat)
# to include row and column labels, put matrix in DataFrame
df = pd.DataFrame(data=M) # or pd.DataFrame(data=M, index=['0', '1'], columns=['0', '1'])
print(f'df:\n{df}')
df:
   0  1
0  4  2
1  1  3
# peel off component counts from confusion matrix
TN, FP, FN, TP = M.ravel() # .ravel() returns a vector from a matrix
print(f'TN={TN}, FP={FP}, FN={FN}, TP={TP}')
TN=4, FP=2, FN=1, TP=3
precision = precision_score(y_true=y, y_pred=y_hat)
recall = recall_score( y_true=y, y_pred=y_hat)
accuracy = accuracy_score(y_true=y, y_pred=y_hat)
print(f'precision={precision}, recall={recall}, accuracy={accuracy}')
precision=0.6, recall=0.75, accuracy=0.7
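These metrics can be checked by hand from the confusion-matrix counts; here is a quick sketch using the TN, FP, FN, TP peeled off above.
# check sklearn's metrics by hand from the confusion-matrix counts
precision_by_hand = TP / (TP + FP)                  # 3 / (3 + 2) = 0.6
recall_by_hand = TP / (TP + FN)                     # 3 / (3 + 1) = 0.75
accuracy_by_hand = (TP + TN) / (TP + TN + FP + FN)  # (3 + 4) / 10 = 0.7
print(f'precision={precision_by_hand}, recall={recall_by_hand}, accuracy={accuracy_by_hand}')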
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
print(f'TPR={TPR:.3}, FPR={FPR:.3}')
TPR=0.75, FPR=0.333
I had to upgrade scikit-learn to make RocCurveDisplay.from_estimator(), used below, work.
# !pip show scikit-learn # check version; I think 0.24.2 fails
# !pip install -U scikit-learn # upgrade version to 1.0.2 or newer.
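A quick way to check the installed version from within Python (a small sketch):
import sklearn
print(sklearn.__version__)  # per the note above, 0.24.2 fails and 1.0.2 or newer works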
The wine data are described here: wine-dataset.
# get real data for classifying wine
wine = load_wine()
X = wine.data
y = wine.target
y = (y == 2).astype(int) # the data have y in {0, 1, 2}: go binary by mapping 0 and 1 to 0, 2 to 1
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
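Here is a quick sketch to sanity-check what the binary mapping did to the class balance.
# count 0s and 1s after the binary mapping; class 1 (originally class 2) should be the minority
print(f'overall class counts after binary mapping: {np.bincount(y)}')
print(f'training class counts: {np.bincount(y_train)}')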
# train SVM on training data; make ROC curve on test data
clf = svm.SVC(random_state=0)
clf.fit(X_train, y_train)
RocCurveDisplay.from_estimator(clf, X_test, y_test)
plt.plot([0, 1], [0, 1], ':k', label='merely guessing') # add diagonal line
plt.title('ROC curve for classifying wine with SVM')
plt.legend()
plt.show(block=False)
accuracy = clf.score(X_test, y_test)
print(f'Accuracy on test data is {accuracy:.3}.')
print(f'Or, via accuracy_score(), it is {accuracy_score(y_test, clf.predict(X_test)):.3}.')
Accuracy on test data is 0.822.
Or, via accuracy_score(), it is 0.822.
I am not excited about the ROC curve above, as I cannot find a satisfactory TPR/FPR combination.
Let's try another classifier.
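Before switching, here is a quick sketch to put a number on that impression by computing AUC for the SVM from its decision_function() scores.
# quantify the SVM's ROC curve above with AUC, using decision-function scores as y_score
svm_auc = roc_auc_score(y_true=y_test, y_score=clf.decision_function(X_test))
print(f'SVM area under ROC curve on test data is {svm_auc:.3}.')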
# train a decision tree on wine to see if it has better ROC curve
clf = DecisionTreeClassifier(criterion='entropy', max_depth=1, random_state=0)
clf.fit(X_train, y_train)
RocCurveDisplay.from_estimator(clf, X_test, y_test)
plt.plot([0, 1], [0, 1], ':k', label='merely guessing')
plt.title('ROC curve for classifying wine with decision tree')
plt.legend()
plt.show()
The decision tree isn't so bad, as we can have TPR$\approx$0.9 with FPR$\approx$0.1.
# calculate AUC without the help of RocCurveDisplay.from_estimator()
probability_y_is_1 = clf.predict_proba(X_test)[:, 1]
accuracy = clf.score(X_test, y_test)
auc = roc_auc_score(y_true=y_test, y_score=probability_y_is_1)
print(f'Accuracy on test data is {accuracy:.3}')
print(f'Area under ROC curve on test data is {auc:.3}.')
Accuracy on test data is 0.911
Area under ROC curve on test data is 0.897.
# look at tree--why does it do better than SVM?
tree.plot_tree(clf, feature_names=wine.feature_names, class_names=wine.target_names)
plt.show(block=False)
probability_y_is_1 = clf.predict_proba(X_test)[:, 1]
RocCurveDisplay.from_predictions(y_true=y_test, y_pred=probability_y_is_1)
plt.show(block=False)
FPR, TPR, thresholds = roc_curve(y_true=y_test, y_score=probability_y_is_1)
with np.printoptions(precision=3): # set precision for this block only
    print(f'FPR={FPR},\nTPR={TPR},\nthresholds={thresholds}') # ignore thresholds[0]
RocCurveDisplay.from_predictions(y_true=y_test, y_pred=probability_y_is_1)
plt.plot(FPR, TPR, 'or', label='from roc_curve()')
plt.legend()
plt.show(block=False)
FPR=[0.    0.081 1.   ],
TPR=[0.    0.875 1.   ],
thresholds=[1.905 0.905 0.022]
left_leaf_prob = 38 / (4 + 38) # these should be the decision thresholds
right_leaf_prob = 2 / (89 + 2)
print(f'left_leaf_prob={left_leaf_prob:.3}, right_leaf_prob={right_leaf_prob:.3},\n' +
      f'thresholds[1:]={thresholds[1:]}')
# now let's find TN, FP, FN, and TP and then FPR and TPR for the first threshold
predicted_y_hat = (probability_y_is_1 >= thresholds[1]).astype(int)
TN = np.sum((y_test == 0) & (predicted_y_hat == 0))
FP = np.sum((y_test == 0) & (predicted_y_hat == 1))
FN = np.sum((y_test == 1) & (predicted_y_hat == 0))
TP = np.sum((y_test == 1) & (predicted_y_hat == 1))
print(f'TN={TN}, FP={FP}, FN={FN}, TP={TP}')
FPR = FP / (FP + TN)
TPR = TP / (TP + FN)
print(f'FPR={FPR:.3}, TPR={TPR:.3}')
left_leaf_prob=0.905, right_leaf_prob=0.022,
thresholds[1:]=[0.9047619 0.02197802]
TN=34, FP=3, FN=1, TP=7
FPR=0.0811, TPR=0.875
print(f'Training accuracy is {clf.score(X_train, y_train):.3}.')
scores = cross_val_score(clf, X, y)
with np.printoptions(precision=2): # set precision for this block only
    print(f'Cross-validation scores={scores}, mean={np.mean(scores):.3}')
print(f'Test accuracy is {clf.score(X_test, y_test):.3}.')
Training accuracy is 0.955.
Cross-validation scores=[0.89 0.94 0.78 0.83 0.97], mean=0.882
Test accuracy is 0.911.
Notice that training accuracy overestimated test accuracy, while the cross-validation accuracy gave a closer estimate of it.
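A rough way to gauge how much the cross-validation estimate itself might vary is to look at the spread of the fold scores; a small sketch:
# spread of the five fold scores gives a rough sense of the estimate's variability
print(f'CV fold scores: mean={np.mean(scores):.3}, standard deviation={np.std(scores):.3}')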
# reuse concentric circles data from SVM kernel trick demo; first display data
df = pd.read_csv('http://www.stat.wisc.edu/~jgillett/451/data/circles.csv')
X = df[['x0', 'x1']]
y = df.y
plt.plot(df.x0[y == 0], df.x1[y == 0], '.r', label='0')
plt.plot(df.x0[y == 1], df.x1[y == 1], '.b', label='1')
plt.legend()
plt.show(block=False)
# do grid search to find which kernel/C combination works best
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 1000]}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(X, y)
print(f'clf.best_score_={clf.best_score_:.3}, ' +
      f'clf.best_params_={clf.best_params_}')
clf.best_score_=1.0, clf.best_params_={'C': 1, 'kernel': 'rbf'}
# inspect tuning results in more detail
print(pd.DataFrame(clf.cv_results_))
   mean_fit_time  std_fit_time  mean_score_time  std_score_time param_C  \
0       0.002284      0.000531         0.001400        0.000098       1
1       0.001634      0.000035         0.001228        0.000045       1
2       0.051756      0.014846         0.001903        0.000046    1000
3       0.001761      0.000164         0.001236        0.000050    1000

  param_kernel                           params  split0_test_score  \
0       linear     {'C': 1, 'kernel': 'linear'}                0.5
1          rbf        {'C': 1, 'kernel': 'rbf'}                1.0
2       linear  {'C': 1000, 'kernel': 'linear'}                0.5
3          rbf     {'C': 1000, 'kernel': 'rbf'}                1.0

   split1_test_score  split2_test_score  split3_test_score  split4_test_score  \
0                0.5              0.500                0.5                0.5
1                1.0              1.000                1.0                1.0
2                0.5              0.625                0.5                0.5
3                1.0              1.000                1.0                1.0

   mean_test_score  std_test_score  rank_test_score
0            0.500            0.00                4
1            1.000            0.00                1
2            0.525            0.05                3
3            1.000            0.00                1
# repeat "inspect" line above, this time printing only most important columns
print(pd.DataFrame(clf.cv_results_).iloc[:, [6, 12]].
      sort_values(by='mean_test_score', axis=0, ascending=False))
                            params  mean_test_score
1        {'C': 1, 'kernel': 'rbf'}            1.000
3     {'C': 1000, 'kernel': 'rbf'}            1.000
2  {'C': 1000, 'kernel': 'linear'}            0.525
0     {'C': 1, 'kernel': 'linear'}            0.500
From the mean_test_score column, it looks like either C value worked perfectly for kernel='rbf'.
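Since GridSearchCV refits the best parameter combination on all of the data by default (refit=True), the fitted clf can be used directly for prediction. Here is a minimal sketch using two made-up query points.
# clf.best_estimator_ is the SVC refit with the best kernel/C found above
print(f'clf.best_estimator_={clf.best_estimator_}')
new_points = pd.DataFrame([[0.0, 0.0], [2.0, 2.0]], columns=['x0', 'x1'])  # made-up points
print(f'predictions for made-up points: {clf.predict(new_points)}')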
rng = np.random.default_rng(seed=0)
distributions = {
'kernel': ('linear', 'rbf'),
'C': uniform(loc=0, scale=1000) # uniform[loc, loc + scale]
}
clf = RandomizedSearchCV(svc, param_distributions=distributions, n_iter=10)
clf.fit(X, y)
print(f'clf.best_score_={clf.best_score_:.3}, ' +
      f'clf.best_params_={clf.best_params_}')
clf.best_score_=1.0, clf.best_params_={'C': 824.2823906849003, 'kernel': 'rbf'}
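As with GridSearchCV, the sampled parameter combinations and their scores are available in cv_results_; a quick sketch for inspecting them, best first:
# inspect the 10 sampled (C, kernel) combinations, sorted by mean test score
print(pd.DataFrame(clf.cv_results_)[['params', 'mean_test_score']].
      sort_values(by='mean_test_score', ascending=False))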