Imbalance, Stacking, Timing, and Multicore¶

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import StackingClassifier

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from scipy.stats import uniform

import time
import os

Imbalance: see weights used by svm.SVC()'s class_weight='balanced'¶

In [2]:
y = np.array([0, 0, 0, 0, 0, 1]) # 5 zeros, 1 one
N = y.shape[0] # 6
counts = np.bincount(y) # array([5, 1])
n_classes = counts.shape[0] # 2
C_1, C_2 = N / (n_classes * counts) # 0.6, 3.0
print(f'counts={counts}, C_1={C_1}, C_2={C_2}')
counts=[5 1], C_1=0.6, C_2=3.0
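
We can check this hand computation against scikit-learn's own helper. A minimal sketch, assuming only that sklearn.utils.class_weight.compute_class_weight() is available (it is part of the public API):

from sklearn.utils.class_weight import compute_class_weight

# sklearn's 'balanced' heuristic is n_samples / (n_classes * np.bincount(y))
weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
print(f'weights={weights}') # should match C_1=0.6 and C_2=3.0 computed above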

Imbalance: oversampling, undersampling¶

In [3]:
# Do 'New > Terminal' and then run 'conda install -c conda-forge imbalanced-learn' to install package.
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

X = np.array([1, 2, 3, 4, 5, 6]).reshape(-1, 1)
y = np.array([0, 0, 0, 0, 1, 1])
rs = RandomOverSampler()
X_resampled, y_resampled = rs.fit_resample(X, y)
print(f'Oversampling: X_resampled={X_resampled},\ny_resampled={y_resampled}')
rs = RandomUnderSampler()
X_resampled, y_resampled = rs.fit_resample(X, y)
print(f'Undersampling: X_resampled={X_resampled},\ny_resampled={y_resampled}')
Oversampling: X_resampled=[[1]
 [2]
 [3]
 [4]
 [5]
 [6]
 [5]
 [5]],
y_resampled=[0 0 0 0 1 1 1 1]
Undersampling: X_resampled=[[1]
 [2]
 [5]
 [6]],
y_resampled=[0 0 1 1]
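
Besides random over- and undersampling, imbalanced-learn also offers SMOTE, which creates synthetic minority-class points by interpolating between existing ones. A minimal sketch on the same toy data (not run above; k_neighbors is reduced to 1 because the minority class here has only two samples):

from imblearn.over_sampling import SMOTE

sm = SMOTE(k_neighbors=1, random_state=0)
X_sm, y_sm = sm.fit_resample(X, y)
# the synthetic minority samples lie on the segment between 5 and 6
print(f'SMOTE: X_sm={X_sm},\ny_sm={y_sm}')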

Stacking¶

In [4]:
digits = load_digits()
X = digits.data
y = digits.target
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

First check classifiers individually.¶

In [5]:
clf = svm.SVC(kernel="linear", C=1000)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
Out[5]:
0.98
In [6]:
clf = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=0)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
Out[6]:
0.8066666666666666
In [7]:
clf = KNeighborsClassifier(n_neighbors=1, metric='euclidean')
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
Out[7]:
0.9822222222222222
In [8]:
clf = LogisticRegression(max_iter=3000)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
/home/jgillett/.local/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Out[8]:
0.9644444444444444
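
The warning above suggests scaling the data. A sketch of one way to do that with a pipeline (an alternative not used in the comparison above):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# standardize features so that lbfgs converges more easily
clf = make_pipeline(StandardScaler(), LogisticRegression(max_iter=3000))
clf.fit(X_train, y_train)
clf.score(X_test, y_test)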

Try stacking the first three, using the fourth as the final estimator.¶

In [9]:
estimators = [
    ('SVM', svm.SVC(kernel="linear", C=1000)),
    ('kNN', KNeighborsClassifier(n_neighbors=1, metric='euclidean')),
    ('DecisionTree', DecisionTreeClassifier(criterion='entropy', max_depth=None, random_state=0))
]
clf = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression(max_iter=3000)
)

clf.fit(X_train, y_train)
clf.score(X_test, y_test)
Out[9]:
0.9888888888888889

See if we can understand what the StackingClassifier() did.¶

In [10]:
# inspect components of clf:
print(f'clf.estimators_={clf.estimators_}')
print(f'clf.final_estimator_={clf.final_estimator_}')
print(f'clf.stack_method_={clf.stack_method_}')
clf.estimators_=[SVC(C=1000, kernel='linear'), KNeighborsClassifier(metric='euclidean', n_neighbors=1), DecisionTreeClassifier(criterion='entropy', random_state=0)]
clf.final_estimator_=LogisticRegression(max_iter=3000)
clf.stack_method_=['decision_function', 'predict_proba', 'predict_proba']
In [11]:
# reproduce the StackingClassifier()'s output without using StackingClassifier(),
# directly from the stacking description in the lecture notes:
np.set_printoptions(precision=3)

y1_hat = clf.estimators_[0].decision_function(X_test)
print(f'y1_hat{y1_hat.shape}={y1_hat}')

y2_hat = clf.estimators_[1].predict_proba(X_test)
print(f'y2_hat{y2_hat.shape}={y2_hat}')

y3_hat = clf.estimators_[2].predict_proba(X_test)
print(f'y3_hat{y3_hat.shape}={y3_hat}')

x_hat = np.column_stack((y1_hat, y2_hat, y3_hat))
print(f'x_hat{x_hat.shape}={x_hat}')
y_hat = clf.final_estimator_.predict(x_hat)
print(f'y_hat{y_hat.shape}={y_hat}')

y_hat_from_clf = clf.predict(X_test)
print(f'y_hat_from_clf{y_hat_from_clf.shape}={y_hat_from_clf}')

assert np.all(y_hat == y_hat_from_clf)
y1_hat(450, 10)=[[ 6.289  1.732  9.307 ...  3.821  4.77   6.239]
 [ 9.312  1.697  2.735 ...  0.703  6.308  5.17 ]
 [ 3.787  5.252 -0.31  ...  7.276  6.306  3.767]
 ...
 [-0.316  9.314  7.279 ...  4.764  7.305  2.711]
 [ 2.785  9.288  0.733 ...  1.715  7.311  4.926]
 [ 1.729  8.295 -0.319 ...  6.264  5.283  9.32 ]]
y2_hat(450, 10)=[[0. 0. 1. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]]
y3_hat(450, 10)=[[0. 0. 1. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]]
x_hat(450, 30)=[[ 6.289  1.732  9.307 ...  0.     0.     0.   ]
 [ 9.312  1.697  2.735 ...  0.     0.     0.   ]
 [ 3.787  5.252 -0.31  ...  0.     0.     0.   ]
 ...
 [-0.316  9.314  7.279 ...  0.     0.     0.   ]
 [ 2.785  9.288  0.733 ...  0.     0.     0.   ]
 [ 1.729  8.295 -0.319 ...  0.     0.     1.   ]]
y_hat(450,)=[2 0 4 9 4 1 2 4 6 7 9 1 8 0 9 8 2 9 7 7 0 2 6 7 2 1 5 7 2 4 3 4 3 6 1 2 4
 8 4 6 2 8 1 8 7 6 5 9 1 0 3 6 8 0 4 5 0 2 5 5 7 8 7 3 2 2 9 3 8 5 3 7 6 6
 3 7 3 7 0 9 6 7 1 6 6 8 2 7 4 3 3 6 3 9 3 4 9 6 6 4 4 0 1 2 9 9 8 3 6 8 1
 5 4 9 2 0 7 9 0 7 1 9 8 2 8 5 7 4 8 0 0 5 0 5 3 7 6 4 7 6 2 0 7 9 3 1 4 6
 8 8 1 6 3 2 3 4 0 4 9 6 0 2 7 2 0 1 4 4 1 0 4 4 1 0 7 2 8 2 5 7 6 3 2 3 8
 6 7 4 3 5 6 5 1 3 4 1 1 6 3 7 8 5 5 3 8 5 3 1 2 3 5 0 7 3 5 0 8 8 6 5 4 4
 9 9 4 4 7 4 1 3 1 5 1 0 9 6 5 9 0 4 5 0 1 7 5 0 0 1 4 5 8 1 9 6 8 2 2 8 5
 3 3 9 3 7 7 3 5 0 2 4 2 9 1 6 7 1 2 7 2 9 7 5 4 6 2 2 3 3 6 0 9 9 8 2 7 1
 5 6 1 7 2 5 3 8 0 8 1 2 0 6 2 1 7 7 1 9 8 6 0 2 4 2 7 7 7 1 5 3 4 3 8 9 2
 1 3 1 4 0 3 8 1 0 0 8 4 6 0 0 4 2 0 3 0 9 5 8 1 9 6 9 7 3 9 6 6 6 3 0 5 2
 6 5 8 6 1 6 9 6 7 8 4 0 7 3 1 1 9 8 5 0 5 0 1 4 5 4 8 4 6 5 7 6 5 4 9 5 2
 1 5 5 8 3 0 2 5 9 8 5 4 2 5 3 8 0 8 5 4 0 9 1 7 7 5 3 3 7 3 6 8 9 5 2 9 3
 1 9 9 1 1 9]
y_hat_from_clf(450,)=[2 0 4 9 4 1 2 4 6 7 9 1 8 0 9 8 2 9 7 7 0 2 6 7 2 1 5 7 2 4 3 4 3 6 1 2 4
 8 4 6 2 8 1 8 7 6 5 9 1 0 3 6 8 0 4 5 0 2 5 5 7 8 7 3 2 2 9 3 8 5 3 7 6 6
 3 7 3 7 0 9 6 7 1 6 6 8 2 7 4 3 3 6 3 9 3 4 9 6 6 4 4 0 1 2 9 9 8 3 6 8 1
 5 4 9 2 0 7 9 0 7 1 9 8 2 8 5 7 4 8 0 0 5 0 5 3 7 6 4 7 6 2 0 7 9 3 1 4 6
 8 8 1 6 3 2 3 4 0 4 9 6 0 2 7 2 0 1 4 4 1 0 4 4 1 0 7 2 8 2 5 7 6 3 2 3 8
 6 7 4 3 5 6 5 1 3 4 1 1 6 3 7 8 5 5 3 8 5 3 1 2 3 5 0 7 3 5 0 8 8 6 5 4 4
 9 9 4 4 7 4 1 3 1 5 1 0 9 6 5 9 0 4 5 0 1 7 5 0 0 1 4 5 8 1 9 6 8 2 2 8 5
 3 3 9 3 7 7 3 5 0 2 4 2 9 1 6 7 1 2 7 2 9 7 5 4 6 2 2 3 3 6 0 9 9 8 2 7 1
 5 6 1 7 2 5 3 8 0 8 1 2 0 6 2 1 7 7 1 9 8 6 0 2 4 2 7 7 7 1 5 3 4 3 8 9 2
 1 3 1 4 0 3 8 1 0 0 8 4 6 0 0 4 2 0 3 0 9 5 8 1 9 6 9 7 3 9 6 6 6 3 0 5 2
 6 5 8 6 1 6 9 6 7 8 4 0 7 3 1 1 9 8 5 0 5 0 1 4 5 4 8 4 6 5 7 6 5 4 9 5 2
 1 5 5 8 3 0 2 5 9 8 5 4 2 5 3 8 0 8 5 4 0 9 1 7 7 5 3 3 7 3 6 8 9 5 2 9 3
 1 9 9 1 1 9]
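
The reproduction above covers prediction. At fit time, StackingClassifier() trains the final estimator not on these in-sample outputs but on out-of-fold predictions of the base estimators (by default via 5-fold cross_val_predict()). A sketch of how those training meta-features could be built (the internal fold assignments may differ):

from sklearn.model_selection import cross_val_predict

# out-of-fold base-model outputs on the training set, analogous to x_hat above
z1 = cross_val_predict(svm.SVC(kernel="linear", C=1000), X_train, y_train,
                       method='decision_function')
z2 = cross_val_predict(KNeighborsClassifier(n_neighbors=1, metric='euclidean'),
                       X_train, y_train, method='predict_proba')
z3 = cross_val_predict(DecisionTreeClassifier(criterion='entropy', random_state=0),
                       X_train, y_train, method='predict_proba')
meta_X_train = np.column_stack((z1, z2, z3)) # shape (n_train_samples, 30)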

Algorithm Efficiency and Timing Code¶

If you do not know the run-time complexity of your algorithm, you can time it for several input sizes to get some insight into how the run time relates to the input size.

e.g., here we time sorting arrays of random numbers for each of several sample sizes and then make a plot to see the relationship between run time and sample size.

In [12]:
sample_sizes = 1000000 * (1 + np.arange(10))
n_sizes = len(sample_sizes)
times = np.zeros(shape=n_sizes)
rng = np.random.default_rng(seed=0)
for i in np.arange(n_sizes):
    N = sample_sizes[i]
    a = rng.random(size=N) # random floats from [0.0, 1.0)
    start = time.time()
    discard = np.sort(a) # how fast is this algorithm?
    end = time.time()
    times[i] = end - start
    print(f'i={i}, N={N}, times[i]={times[i]}')
i=0, N=1000000, times[i]=0.07830643653869629
i=1, N=2000000, times[i]=0.16305947303771973
i=2, N=3000000, times[i]=0.2500338554382324
i=3, N=4000000, times[i]=0.3455989360809326
i=4, N=5000000, times[i]=0.43494272232055664
i=5, N=6000000, times[i]=0.5314669609069824
i=6, N=7000000, times[i]=0.6353640556335449
i=7, N=8000000, times[i]=0.7294108867645264
i=8, N=9000000, times[i]=0.8157608509063721
i=9, N=10000000, times[i]=0.9213337898254395
In [13]:
plt.plot(sample_sizes, times, '.')
plt.title('Run time vs. array size')
plt.xlabel('array size N')
plt.ylabel('time (seconds)')
Out[13]:
Text(0, 0.5, 'time (seconds)')

It would be easy to make a regression model, or just estimate the model by eye, and then predict the run time for a particular N. (I am recommending extrapolating here as better than nothing, even though extrapolation is typically risky.)
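
For instance, a sketch (assuming the sort is roughly O(N log N)) that fits a one-parameter least-squares model time ≈ c · N log N to the measurements above and extrapolates to a larger N:

# closed-form least-squares slope for a line through the origin in x = N*log(N)
x = sample_sizes * np.log(sample_sizes)
c = np.sum(x * times) / np.sum(x * x)
N_new = 20_000_000
print(f'predicted time for N={N_new}: {c * N_new * np.log(N_new):.2f} seconds')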

Multicore computing¶

In [14]:
n_CPU = os.cpu_count()
print(f'n_CPU={n_CPU}.')

df = pd.read_csv('http://www.stat.wisc.edu/~jgillett/451/data/circles.csv')
X = df[['x0', 'x1']]
y = df['y']

rng = np.random.default_rng(seed=0)
distributions = {
    'kernel': ('linear', 'rbf'),
    'C': uniform(loc=0, scale=1000) # uniform[loc, loc + scale]
}

times = np.zeros(shape=n_CPU+1) # ignore position [0]
numbers_of_CPUs = 1 + np.arange(start=0, stop=n_CPU)
for n_jobs in numbers_of_CPUs:
  clf = RandomizedSearchCV(svm.SVC(), param_distributions=distributions, n_iter=100, n_jobs=n_jobs)
  start = time.time()
  clf.fit(X, y)
  end = time.time()
  print(f'clf.best_score_={clf.best_score_:.3}, ' + f'clf.best_params_={clf.best_params_}')
  times[n_jobs] = end - start
  print(f'n_jobs={n_jobs}, clf.fit() took {times[n_jobs]:.3} seconds.')

plt.plot(numbers_of_CPUs, times[1:], 'or')
plt.title('RandomizedSearchCV() time vs. n_jobs')
plt.xticks(ticks=np.append(0, numbers_of_CPUs))
plt.xlabel('n_jobs')
plt.ylabel('time (seconds)')
_ = plt.ylim(0, 1.1*np.max(times))
n_CPU=4.
clf.best_score_=1.0, clf.best_params_={'C': 536.4894263381026, 'kernel': 'rbf'}
n_jobs=1, clf.fit() took 5.2 seconds.
clf.best_score_=1.0, clf.best_params_={'C': 370.63300024610214, 'kernel': 'rbf'}
n_jobs=2, clf.fit() took 3.95 seconds.
clf.best_score_=1.0, clf.best_params_={'C': 524.7741645779938, 'kernel': 'rbf'}
n_jobs=3, clf.fit() took 3.39 seconds.
clf.best_score_=1.0, clf.best_params_={'C': 585.7727633339724, 'kernel': 'rbf'}
n_jobs=4, clf.fit() took 2.29 seconds.
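
A usage note: passing n_jobs=-1 asks scikit-learn to use all available cores, so in practice the loop over core counts above is unnecessary:

# n_jobs=-1 means use all available CPU cores
clf = RandomizedSearchCV(svm.SVC(), param_distributions=distributions,
                         n_iter=100, n_jobs=-1)
clf.fit(X, y)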