Imbalance, Stacking, Timing, and Multicore¶

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import StackingClassifier

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from scipy.stats import uniform

import time
import os

Imbalance: see weights used by svm.SVC()'s class_weight='balanced'¶

In [2]:
y = np.array([0, 0, 0, 0, 0, 1]) # 5 zeros, 1 one
N = y.shape[0] # 6
counts = np.bincount(y) # array([5, 1])
n_classes = counts.shape[0] # 2
C_1, C_2 = N / (n_classes * counts) # 0.6, 3.0
print(f'counts={counts}, C_1={C_1}, C_2={C_2}')
counts=[5 1], C_1=0.6, C_2=3.0
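
We can check this hand computation against scikit-learn's own helper. A minimal sketch, assuming only that sklearn.utils.class_weight.compute_class_weight() is available (it is part of the public API):

from sklearn.utils.class_weight import compute_class_weight

# sklearn's 'balanced' heuristic is n_samples / (n_classes * np.bincount(y))
weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
print(f'weights={weights}') # should match C_1=0.6 and C_2=3.0 computed above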

Imbalance: oversampling, undersampling¶

In [3]:
# Do 'New > Terminal' and then run 'conda install -c conda-forge imbalanced-learn' to install package.
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

X = np.array([1, 2, 3, 4, 5, 6]).reshape(-1, 1)
y = np.array([0, 0, 0, 0, 1, 1])
rs = RandomOverSampler()
X_resampled, y_resampled = rs.fit_resample(X, y)
print(f'Oversampling: X_resampled={X_resampled},\ny_resampled={y_resampled}')
rs = RandomUnderSampler()
X_resampled, y_resampled = rs.fit_resample(X, y)
print(f'Undersampling: X_resampled={X_resampled},\ny_resampled={y_resampled}')
Oversampling: X_resampled=[[1]
 [2]
 [3]
 [4]
 [5]
 [6]
 [5]
 [5]],
y_resampled=[0 0 0 0 1 1 1 1]
Undersampling: X_resampled=[[1]
 [2]
 [5]
 [6]],
y_resampled=[0 0 1 1]
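
Besides random over- and undersampling, imbalanced-learn also offers SMOTE, which creates synthetic minority-class points by interpolating between existing ones. A minimal sketch on the same toy data (not run above; k_neighbors is reduced to 1 because the minority class here has only two samples):

from imblearn.over_sampling import SMOTE

sm = SMOTE(k_neighbors=1, random_state=0)
X_sm, y_sm = sm.fit_resample(X, y)
# the synthetic minority samples lie on the segment between 5 and 6
print(f'SMOTE: X_sm={X_sm},\ny_sm={y_sm}')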

Stacking¶

In [4]:
digits = load_digits()
X = digits.data
y = digits.target
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

First check classifiers individually.¶

In [5]:
clf = svm.SVC(kernel="linear", C=1000)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
Out[5]:
0.98
In [6]:
clf = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=0)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
Out[6]:
0.8066666666666666
In [7]:
clf = KNeighborsClassifier(n_neighbors=1, metric='euclidean')
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
Out[7]:
0.9822222222222222
In [8]:
clf = LogisticRegression(max_iter=3000)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
/home/jgillett/.local/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Out[8]:
0.9644444444444444
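
The warning above suggests scaling the data. A sketch of one way to do that with a pipeline (an alternative not used in the comparison above):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# standardize features so that lbfgs converges more easily
clf = make_pipeline(StandardScaler(), LogisticRegression(max_iter=3000))
clf.fit(X_train, y_train)
clf.score(X_test, y_test)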

Try stacking the first three, using the fourth as the final estimator.¶

In [9]:
estimators = [
    ('SVM', svm.SVC(kernel="linear", C=1000)),
    ('kNN', KNeighborsClassifier(n_neighbors=1, metric='euclidean')),
    ('DecisionTree', DecisionTreeClassifier(criterion='entropy', max_depth=None, random_state=0))
]
clf = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression(max_iter=3000)
)

clf.fit(X_train, y_train)
clf.score(X_test, y_test)
Out[9]:
0.9888888888888889

See if we can understand what the StackingClassifier() did.¶

In [10]:
# inspect components of clf:
print(f'clf.estimators_={clf.estimators_}')
print(f'clf.final_estimator_={clf.final_estimator_}')
print(f'clf.stack_method_={clf.stack_method_}')
clf.estimators_=[SVC(C=1000, kernel='linear'), KNeighborsClassifier(metric='euclidean', n_neighbors=1), DecisionTreeClassifier(criterion='entropy', random_state=0)]
clf.final_estimator_=LogisticRegression(max_iter=3000)
clf.stack_method_=['decision_function', 'predict_proba', 'predict_proba']
In [11]:
# reproduce the StackingClassifier()'s output without using StackingClassifier(),
# directly from the stacking description in the lecture notes:
np.set_printoptions(precision=3)

y1_hat = clf.estimators_[0].decision_function(X_test)
print(f'y1_hat{y1_hat.shape}={y1_hat}')

y2_hat = clf.estimators_[1].predict_proba(X_test)
print(f'y2_hat{y2_hat.shape}={y2_hat}')

y3_hat = clf.estimators_[2].predict_proba(X_test)
print(f'y3_hat{y3_hat.shape}={y3_hat}')

x_hat = np.column_stack((y1_hat, y2_hat, y3_hat))
print(f'x_hat{x_hat.shape}={x_hat}')
y_hat = clf.final_estimator_.predict(x_hat)
print(f'y_hat{y_hat.shape}={y_hat}')

y_hat_from_clf = clf.predict(X_test)
print(f'y_hat_from_clf{y_hat_from_clf.shape}={y_hat_from_clf}')

assert np.all(y_hat == y_hat_from_clf)
y1_hat(450, 10)=[[ 6.289  1.732  9.307 ...  3.821  4.77   6.239]
 [ 9.312  1.697  2.735 ...  0.703  6.308  5.17 ]
 [ 3.787  5.252 -0.31  ...  7.276  6.306  3.767]
 ...
 [-0.316  9.314  7.279 ...  4.764  7.305  2.711]
 [ 2.785  9.288  0.733 ...  1.715  7.311  4.926]
 [ 1.729  8.295 -0.319 ...  6.264  5.283  9.32 ]]
y2_hat(450, 10)=[[0. 0. 1. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]]
y3_hat(450, 10)=[[0. 0. 1. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]]
x_hat(450, 30)=[[ 6.289  1.732  9.307 ...  0.     0.     0.   ]
 [ 9.312  1.697  2.735 ...  0.     0.     0.   ]
 [ 3.787  5.252 -0.31  ...  0.     0.     0.   ]
 ...
 [-0.316  9.314  7.279 ...  0.     0.     0.   ]
 [ 2.785  9.288  0.733 ...  0.     0.     0.   ]
 [ 1.729  8.295 -0.319 ...  0.     0.     1.   ]]
y_hat(450,)=[2 0 4 9 4 1 2 4 6 7 9 1 8 0 9 8 2 9 7 7 0 2 6 7 2 1 5 7 2 4 3 4 3 6 1 2 4
 8 4 6 2 8 1 8 7 6 5 9 1 0 3 6 8 0 4 5 0 2 5 5 7 8 7 3 2 2 9 3 8 5 3 7 6 6
 3 7 3 7 0 9 6 7 1 6 6 8 2 7 4 3 3 6 3 9 3 4 9 6 6 4 4 0 1 2 9 9 8 3 6 8 1
 5 4 9 2 0 7 9 0 7 1 9 8 2 8 5 7 4 8 0 0 5 0 5 3 7 6 4 7 6 2 0 7 9 3 1 4 6
 8 8 1 6 3 2 3 4 0 4 9 6 0 2 7 2 0 1 4 4 1 0 4 4 1 0 7 2 8 2 5 7 6 3 2 3 8
 6 7 4 3 5 6 5 1 3 4 1 1 6 3 7 8 5 5 3 8 5 3 1 2 3 5 0 7 3 5 0 8 8 6 5 4 4
 9 9 4 4 7 4 1 3 1 5 1 0 9 6 5 9 0 4 5 0 1 7 5 0 0 1 4 5 8 1 9 6 8 2 2 8 5
 3 3 9 3 7 7 3 5 0 2 4 2 9 1 6 7 1 2 7 2 9 7 5 4 6 2 2 3 3 6 0 9 9 8 2 7 1
 5 6 1 7 2 5 3 8 0 8 1 2 0 6 2 1 7 7 1 9 8 6 0 2 4 2 7 7 7 1 5 3 4 3 8 9 2
 1 3 1 4 0 3 8 1 0 0 8 4 6 0 0 4 2 0 3 0 9 5 8 1 9 6 9 7 3 9 6 6 6 3 0 5 2
 6 5 8 6 1 6 9 6 7 8 4 0 7 3 1 1 9 8 5 0 5 0 1 4 5 4 8 4 6 5 7 6 5 4 9 5 2
 1 5 5 8 3 0 2 5 9 8 5 4 2 5 3 8 0 8 5 4 0 9 1 7 7 5 3 3 7 3 6 8 9 5 2 9 3
 1 9 9 1 1 9]
y_hat_from_clf(450,)=[2 0 4 9 4 1 2 4 6 7 9 1 8 0 9 8 2 9 7 7 0 2 6 7 2 1 5 7 2 4 3 4 3 6 1 2 4
 8 4 6 2 8 1 8 7 6 5 9 1 0 3 6 8 0 4 5 0 2 5 5 7 8 7 3 2 2 9 3 8 5 3 7 6 6
 3 7 3 7 0 9 6 7 1 6 6 8 2 7 4 3 3 6 3 9 3 4 9 6 6 4 4 0 1 2 9 9 8 3 6 8 1
 5 4 9 2 0 7 9 0 7 1 9 8 2 8 5 7 4 8 0 0 5 0 5 3 7 6 4 7 6 2 0 7 9 3 1 4 6
 8 8 1 6 3 2 3 4 0 4 9 6 0 2 7 2 0 1 4 4 1 0 4 4 1 0 7 2 8 2 5 7 6 3 2 3 8
 6 7 4 3 5 6 5 1 3 4 1 1 6 3 7 8 5 5 3 8 5 3 1 2 3 5 0 7 3 5 0 8 8 6 5 4 4
 9 9 4 4 7 4 1 3 1 5 1 0 9 6 5 9 0 4 5 0 1 7 5 0 0 1 4 5 8 1 9 6 8 2 2 8 5
 3 3 9 3 7 7 3 5 0 2 4 2 9 1 6 7 1 2 7 2 9 7 5 4 6 2 2 3 3 6 0 9 9 8 2 7 1
 5 6 1 7 2 5 3 8 0 8 1 2 0 6 2 1 7 7 1 9 8 6 0 2 4 2 7 7 7 1 5 3 4 3 8 9 2
 1 3 1 4 0 3 8 1 0 0 8 4 6 0 0 4 2 0 3 0 9 5 8 1 9 6 9 7 3 9 6 6 6 3 0 5 2
 6 5 8 6 1 6 9 6 7 8 4 0 7 3 1 1 9 8 5 0 5 0 1 4 5 4 8 4 6 5 7 6 5 4 9 5 2
 1 5 5 8 3 0 2 5 9 8 5 4 2 5 3 8 0 8 5 4 0 9 1 7 7 5 3 3 7 3 6 8 9 5 2 9 3
 1 9 9 1 1 9]
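
The reproduction above covers prediction. At fit time, StackingClassifier() trains the final estimator not on these in-sample outputs but on out-of-fold predictions of the base estimators (by default via 5-fold cross_val_predict()). A sketch of how those training meta-features could be built (the internal fold assignments may differ):

from sklearn.model_selection import cross_val_predict

# out-of-fold base-model outputs on the training set, analogous to x_hat above
z1 = cross_val_predict(svm.SVC(kernel="linear", C=1000), X_train, y_train,
                       method='decision_function')
z2 = cross_val_predict(KNeighborsClassifier(n_neighbors=1, metric='euclidean'),
                       X_train, y_train, method='predict_proba')
z3 = cross_val_predict(DecisionTreeClassifier(criterion='entropy', random_state=0),
                       X_train, y_train, method='predict_proba')
meta_X_train = np.column_stack((z1, z2, z3)) # shape (n_train_samples, 30)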

Algorithm Efficiency and Timing Code¶

If you do not know the run-time complexity of your algorithm, you can time it for several input sizes to get some insight into how the run time relates to the input size.

e.g., here we time sorting arrays of random numbers for each of several sample sizes and then make a plot to see the relationship between run time and sample size.

In [12]:
sample_sizes = 1000000 * (1 + np.arange(10))
n_sizes = len(sample_sizes)
times = np.zeros(shape=n_sizes)
rng = np.random.default_rng(seed=0)
for i in np.arange(n_sizes):
    N = sample_sizes[i]
    a = rng.random(size=N) # random floats from [0.0, 1.0)
    start = time.time()
    discard = np.sort(a) # how fast is this algorithm?
    end = time.time()
    times[i] = end - start
    print(f'i={i}, N={N}, times[i]={times[i]}')
i=0, N=1000000, times[i]=0.07830643653869629
i=1, N=2000000, times[i]=0.16305947303771973
i=2, N=3000000, times[i]=0.2500338554382324
i=3, N=4000000, times[i]=0.3455989360809326
i=4, N=5000000, times[i]=0.43494272232055664
i=5, N=6000000, times[i]=0.5314669609069824
i=6, N=7000000, times[i]=0.6353640556335449
i=7, N=8000000, times[i]=0.7294108867645264
i=8, N=9000000, times[i]=0.8157608509063721
i=9, N=10000000, times[i]=0.9213337898254395
In [13]:
plt.plot(sample_sizes, times, '.')
plt.title('Run time vs. array size')
plt.xlabel('array size N')
plt.ylabel('time (seconds)')
Out[13]:
Text(0, 0.5, 'time (seconds)')

It would be easy to make a regression model, or just estimate the model by eye, and then predict the run time for a particular N. (I am recommending extrapolating here as better than nothing, even though extrapolation is typically risky.)
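
For instance, a sketch (assuming the sort is roughly O(N log N)) that fits a one-parameter least-squares model time ≈ c · N log N to the measurements above and extrapolates to a larger N:

# closed-form least-squares slope for a line through the origin in x = N*log(N)
x = sample_sizes * np.log(sample_sizes)
c = np.sum(x * times) / np.sum(x * x)
N_new = 20_000_000
print(f'predicted time for N={N_new}: {c * N_new * np.log(N_new):.2f} seconds')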

Multicore computing¶

In [14]:
n_CPU = os.cpu_count()
print(f'n_CPU={n_CPU}.')

df = pd.read_csv('http://www.stat.wisc.edu/~jgillett/451/data/circles.csv')
X = df[['x0', 'x1']]
y = df['y']

rng = np.random.default_rng(seed=0)
distributions = {
    'kernel': ('linear', 'rbf'),
    'C': uniform(loc=0, scale=1000) # uniform[loc, loc + scale]
}

times = np.zeros(shape=n_CPU+1) # ignore position [0]
numbers_of_CPUs = 1 + np.arange(start=0, stop=n_CPU)
for n_jobs in numbers_of_CPUs:
  clf = RandomizedSearchCV(svm.SVC(), param_distributions=distributions, n_iter=100, n_jobs=n_jobs)
  start = time.time()
  clf.fit(X, y)
  end = time.time()
  print(f'clf.best_score_={clf.best_score_:.3}, ' + f'clf.best_params_={clf.best_params_}')
  times[n_jobs] = end - start
  print(f'n_jobs={n_jobs}, clf.fit() took {times[n_jobs]:.3} seconds.')

plt.plot(numbers_of_CPUs, times[1:], 'or')
plt.title('RandomizedSearchCV() time vs. n_jobs')
plt.xticks(ticks=np.append(0, numbers_of_CPUs))
plt.xlabel('n_jobs')
plt.ylabel('time (seconds)')
_ = plt.ylim(0, 1.1*np.max(times))
n_CPU=4.
clf.best_score_=1.0, clf.best_params_={'C': 536.4894263381026, 'kernel': 'rbf'}
n_jobs=1, clf.fit() took 5.2 seconds.
clf.best_score_=1.0, clf.best_params_={'C': 370.63300024610214, 'kernel': 'rbf'}
n_jobs=2, clf.fit() took 3.95 seconds.
clf.best_score_=1.0, clf.best_params_={'C': 524.7741645779938, 'kernel': 'rbf'}
n_jobs=3, clf.fit() took 3.39 seconds.
clf.best_score_=1.0, clf.best_params_={'C': 585.7727633339724, 'kernel': 'rbf'}
n_jobs=4, clf.fit() took 2.29 seconds.
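
A usage note: passing n_jobs=-1 asks scikit-learn to use all available cores, so in practice the loop over core counts above is unnecessary:

# n_jobs=-1 means use all available CPU cores
clf = RandomizedSearchCV(svm.SVC(), param_distributions=distributions,
                         n_iter=100, n_jobs=-1)
clf.fit(X, y)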