import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform
import time
import os
y = np.array([0, 0, 0, 0, 0, 1]) # 5 zeros, 1 one
N = y.shape[0] # 6
counts = np.bincount(y) # array([5, 1])
n_classes = counts.shape[0] # 2
C_1, C_2 = N / (n_classes * counts) # 0.6, 3.0
print(f'counts={counts}, C_1={C_1}, C_2={C_2}')
counts=[5 1], C_1=0.6, C_2=3.0
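As a cross-check (not part of the original run), sklearn's compute_class_weight() implements this same 'balanced' formula, N / (n_classes * counts); a minimal sketch that should reproduce 0.6 and 3.0:
from sklearn.utils.class_weight import compute_class_weight
weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
print(f'weights={weights}') # expect approximately [0.6 3. ]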
# Do 'New > Terminal' and then run 'conda install -c conda-forge imbalanced-learn' to install the package.
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
X = np.array([1, 2, 3, 4, 5, 6]).reshape(-1, 1)
y = np.array([0, 0, 0, 0, 1, 1])
rs = RandomOverSampler()
X_resampled, y_resampled = rs.fit_resample(X, y)
print(f'Oversampling: X_resampled={X_resampled},\ny_resampled={y_resampled}')
rs = RandomUnderSampler()
X_resampled, y_resampled = rs.fit_resample(X, y)
print(f'Undersampling: X_resampled={X_resampled},\ny_resampled={y_resampled}')
Oversampling: X_resampled=[[1] [2] [3] [4] [5] [6] [5] [5]],
y_resampled=[0 0 0 0 1 1 1 1]
Undersampling: X_resampled=[[1] [2] [5] [6]],
y_resampled=[0 0 1 1]
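As an alternative to resampling, many sklearn classifiers accept class_weight='balanced', which applies the same N / (n_classes * counts) weights during fitting; a minimal sketch on the toy data above (with counts=[4 2], the weights should come out to 0.75 and 1.5; output not shown here):
clf_balanced = svm.SVC(kernel='linear', class_weight='balanced')
clf_balanced.fit(X, y)
print(f'clf_balanced.class_weight_={clf_balanced.class_weight_}') # per-class multipliers of C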
digits = load_digits()
X = digits.data
y = digits.target
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
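A quick check, added here, that stratify=y keeps the class proportions roughly equal across the two splits:
print(f'train proportions={np.bincount(y_train) / y_train.size}')
print(f'test proportions={np.bincount(y_test) / y_test.size}')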
clf = svm.SVC(kernel="linear", C=1000)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
0.98
clf = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=0)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
0.8066666666666666
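The tree scores well below the other models; a minimal sketch of tuning its depth with the GridSearchCV imported above (the grid values are my own choice, and the resulting score is not shown here):
param_grid = {'max_depth': [5, 10, 20, None]}
search = GridSearchCV(DecisionTreeClassifier(criterion='entropy', random_state=0),
                      param_grid=param_grid, cv=5)
search.fit(X_train, y_train)
print(f'best parameters={search.best_params_}, test score={search.score(X_test, y_test):.3}')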
clf = KNeighborsClassifier(n_neighbors=1, metric='euclidean')
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
0.9822222222222222
clf = LogisticRegression(max_iter=3000)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
/home/jgillett/.local/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
0.9644444444444444
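The ConvergenceWarning above suggests scaling the data; a minimal sketch of doing that with a pipeline (the resulting score is not shown here and may differ slightly from 0.964):
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
clf_scaled = make_pipeline(StandardScaler(), LogisticRegression(max_iter=3000))
clf_scaled.fit(X_train, y_train)
clf_scaled.score(X_test, y_test)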
estimators = [
('SVM', svm.SVC(kernel="linear", C=1000)),
('kNN', KNeighborsClassifier(n_neighbors=1, metric='euclidean')),
('DecisionTree', DecisionTreeClassifier(criterion='entropy', max_depth=None, random_state=0))
]
clf = StackingClassifier(
estimators=estimators, final_estimator=LogisticRegression(max_iter=3000)
)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
0.9888888888888889
# inspect components of clf:
print(f'clf.estimators_={clf.estimators_}')
print(f'clf.final_estimator_={clf.final_estimator_}')
print(f'clf.stack_method_={clf.stack_method_}')
clf.estimators_=[SVC(C=1000, kernel='linear'), KNeighborsClassifier(metric='euclidean', n_neighbors=1), DecisionTreeClassifier(criterion='entropy', random_state=0)]
clf.final_estimator_=LogisticRegression(max_iter=3000)
clf.stack_method_=['decision_function', 'predict_proba', 'predict_proba']
# reproduce the StackingClassifier()'s output without using StackingClassifier(),
# directly from the stacking description in the lecture notes:
np.set_printoptions(precision=3)
y1_hat = clf.estimators_[0].decision_function(X_test)
print(f'y1_hat{y1_hat.shape}={y1_hat}')
y2_hat = clf.estimators_[1].predict_proba(X_test)
print(f'y2_hat{y2_hat.shape}={y2_hat}')
y3_hat = clf.estimators_[2].predict_proba(X_test)
print(f'y3_hat{y3_hat.shape}={y3_hat}')
x_hat = np.column_stack((y1_hat, y2_hat, y3_hat))
print(f'x_hat{x_hat.shape}={x_hat}')
y_hat = clf.final_estimator_.predict(x_hat)
print(f'y_hat{y_hat.shape}={y_hat}')
y_hat_from_clf = clf.predict(X_test)
print(f'y_hat_from_clf{y_hat_from_clf.shape}={y_hat_from_clf}')
assert np.all(y_hat == y_hat_from_clf)
y1_hat(450, 10)=[[ 6.289 1.732 9.307 ... 3.821 4.77 6.239] [ 9.312 1.697 2.735 ... 0.703 6.308 5.17 ] [ 3.787 5.252 -0.31 ... 7.276 6.306 3.767] ... [-0.316 9.314 7.279 ... 4.764 7.305 2.711] [ 2.785 9.288 0.733 ... 1.715 7.311 4.926] [ 1.729 8.295 -0.319 ... 6.264 5.283 9.32 ]] y2_hat(450, 10)=[[0. 0. 1. ... 0. 0. 0.] [1. 0. 0. ... 0. 0. 0.] [0. 0. 0. ... 0. 0. 0.] ... [0. 1. 0. ... 0. 0. 0.] [0. 1. 0. ... 0. 0. 0.] [0. 0. 0. ... 0. 0. 1.]] y3_hat(450, 10)=[[0. 0. 1. ... 0. 0. 0.] [1. 0. 0. ... 0. 0. 0.] [0. 0. 0. ... 0. 0. 0.] ... [0. 0. 1. ... 0. 0. 0.] [0. 0. 1. ... 0. 0. 0.] [0. 0. 0. ... 0. 0. 1.]] x_hat(450, 30)=[[ 6.289 1.732 9.307 ... 0. 0. 0. ] [ 9.312 1.697 2.735 ... 0. 0. 0. ] [ 3.787 5.252 -0.31 ... 0. 0. 0. ] ... [-0.316 9.314 7.279 ... 0. 0. 0. ] [ 2.785 9.288 0.733 ... 0. 0. 0. ] [ 1.729 8.295 -0.319 ... 0. 0. 1. ]] y_hat(450,)=[2 0 4 9 4 1 2 4 6 7 9 1 8 0 9 8 2 9 7 7 0 2 6 7 2 1 5 7 2 4 3 4 3 6 1 2 4 8 4 6 2 8 1 8 7 6 5 9 1 0 3 6 8 0 4 5 0 2 5 5 7 8 7 3 2 2 9 3 8 5 3 7 6 6 3 7 3 7 0 9 6 7 1 6 6 8 2 7 4 3 3 6 3 9 3 4 9 6 6 4 4 0 1 2 9 9 8 3 6 8 1 5 4 9 2 0 7 9 0 7 1 9 8 2 8 5 7 4 8 0 0 5 0 5 3 7 6 4 7 6 2 0 7 9 3 1 4 6 8 8 1 6 3 2 3 4 0 4 9 6 0 2 7 2 0 1 4 4 1 0 4 4 1 0 7 2 8 2 5 7 6 3 2 3 8 6 7 4 3 5 6 5 1 3 4 1 1 6 3 7 8 5 5 3 8 5 3 1 2 3 5 0 7 3 5 0 8 8 6 5 4 4 9 9 4 4 7 4 1 3 1 5 1 0 9 6 5 9 0 4 5 0 1 7 5 0 0 1 4 5 8 1 9 6 8 2 2 8 5 3 3 9 3 7 7 3 5 0 2 4 2 9 1 6 7 1 2 7 2 9 7 5 4 6 2 2 3 3 6 0 9 9 8 2 7 1 5 6 1 7 2 5 3 8 0 8 1 2 0 6 2 1 7 7 1 9 8 6 0 2 4 2 7 7 7 1 5 3 4 3 8 9 2 1 3 1 4 0 3 8 1 0 0 8 4 6 0 0 4 2 0 3 0 9 5 8 1 9 6 9 7 3 9 6 6 6 3 0 5 2 6 5 8 6 1 6 9 6 7 8 4 0 7 3 1 1 9 8 5 0 5 0 1 4 5 4 8 4 6 5 7 6 5 4 9 5 2 1 5 5 8 3 0 2 5 9 8 5 4 2 5 3 8 0 8 5 4 0 9 1 7 7 5 3 3 7 3 6 8 9 5 2 9 3 1 9 9 1 1 9] y_hat_from_clf(450,)=[2 0 4 9 4 1 2 4 6 7 9 1 8 0 9 8 2 9 7 7 0 2 6 7 2 1 5 7 2 4 3 4 3 6 1 2 4 8 4 6 2 8 1 8 7 6 5 9 1 0 3 6 8 0 4 5 0 2 5 5 7 8 7 3 2 2 9 3 8 5 3 7 6 6 3 7 3 7 0 9 6 7 1 6 6 8 2 7 4 3 3 6 3 9 3 4 9 6 6 4 4 0 1 2 9 9 8 3 6 8 1 5 4 9 2 0 7 9 0 7 1 9 8 2 8 5 7 4 8 0 0 5 0 5 3 7 6 4 7 6 2 0 7 9 3 1 4 6 8 8 1 6 3 2 3 4 0 4 9 6 0 2 7 2 0 1 4 4 1 0 4 4 1 0 7 2 8 2 5 7 6 3 2 3 8 6 7 4 3 5 6 5 1 3 4 1 1 6 3 7 8 5 5 3 8 5 3 1 2 3 5 0 7 3 5 0 8 8 6 5 4 4 9 9 4 4 7 4 1 3 1 5 1 0 9 6 5 9 0 4 5 0 1 7 5 0 0 1 4 5 8 1 9 6 8 2 2 8 5 3 3 9 3 7 7 3 5 0 2 4 2 9 1 6 7 1 2 7 2 9 7 5 4 6 2 2 3 3 6 0 9 9 8 2 7 1 5 6 1 7 2 5 3 8 0 8 1 2 0 6 2 1 7 7 1 9 8 6 0 2 4 2 7 7 7 1 5 3 4 3 8 9 2 1 3 1 4 0 3 8 1 0 0 8 4 6 0 0 4 2 0 3 0 9 5 8 1 9 6 9 7 3 9 6 6 6 3 0 5 2 6 5 8 6 1 6 9 6 7 8 4 0 7 3 1 1 9 8 5 0 5 0 1 4 5 4 8 4 6 5 7 6 5 4 9 5 2 1 5 5 8 3 0 2 5 9 8 5 4 2 5 3 8 0 8 5 4 0 9 1 7 7 5 3 3 7 3 6 8 9 5 2 9 3 1 9 9 1 1 9]
If you do not know the run-time of your algorithm, you can time it for several values to get some insight as to how the time relates to the input size.
For example, here we time sorting arrays of random numbers for each of several sample sizes and then make a plot to see the relationship between run time and sample size.
sample_sizes = 1000000 * (1 + np.arange(10))
n_sizes = len(sample_sizes)
times = np.zeros(shape=n_sizes)
rng = np.random.default_rng(seed=0)
for i in np.arange(n_sizes):
    N = sample_sizes[i]
    a = rng.random(size=N) # random floats from [0.0, 1.0), drawn from the seeded generator above
    start = time.time()
    discard = np.sort(a) # how fast is this algorithm?
    end = time.time()
    times[i] = end - start
    print(f'i={i}, N={N}, times[i]={times[i]}')
i=0, N=1000000, times[i]=0.07830643653869629
i=1, N=2000000, times[i]=0.16305947303771973
i=2, N=3000000, times[i]=0.2500338554382324
i=3, N=4000000, times[i]=0.3455989360809326
i=4, N=5000000, times[i]=0.43494272232055664
i=5, N=6000000, times[i]=0.5314669609069824
i=6, N=7000000, times[i]=0.6353640556335449
i=7, N=8000000, times[i]=0.7294108867645264
i=8, N=9000000, times[i]=0.8157608509063721
i=9, N=10000000, times[i]=0.9213337898254395
plt.plot(sample_sizes, times, '.')
plt.title('Run time vs. array size')
plt.xlabel('array size N')
plt.ylabel('time (seconds)')
Text(0, 0.5, 'time (seconds)')
It would be easy to fit a regression model, or just estimate the trend by eye, and then predict the run time for a particular N. (I am recommending extrapolation here as better than nothing, even though extrapolation is typically risky.)
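For example, a minimal sketch of that idea with a straight-line fit (sorting is O(N log N), so a line is only a rough approximation over a wider range; N_new is a hypothetical larger size):
coef = np.polyfit(sample_sizes, times, deg=1) # slope and intercept of a least-squares line
N_new = 20_000_000
predicted = np.polyval(coef, N_new)
print(f'predicted sort time for N={N_new}: {predicted:.2f} seconds')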
n_CPU = os.cpu_count()
print(f'n_CPU={n_CPU}.')
df = pd.read_csv('http://www.stat.wisc.edu/~jgillett/451/data/circles.csv')
X = df[['x0', 'x1']]
y = df['y']
rng = np.random.default_rng(seed=0) # note: not used below; RandomizedSearchCV draws its own random parameter values
distributions = {
'kernel': ('linear', 'rbf'),
'C': uniform(loc=0, scale=1000) # uniform[loc, loc + scale]
}
times = np.zeros(shape=n_CPU+1) # ignore position [0]
numbers_of_CPUs = 1 + np.arange(start=0, stop=n_CPU)
for n_jobs in numbers_of_CPUs:
    clf = RandomizedSearchCV(svm.SVC(), param_distributions=distributions, n_iter=100, n_jobs=n_jobs)
    start = time.time()
    clf.fit(X, y)
    end = time.time()
    print(f'clf.best_score_={clf.best_score_:.3}, ' + f'clf.best_params_={clf.best_params_}')
    times[n_jobs] = end - start
    print(f'n_jobs={n_jobs}, clf.fit() took {times[n_jobs]:.3} seconds.')
plt.plot(numbers_of_CPUs, times[1:], 'or')
plt.title('RandomizedSearchCV() time vs. n_jobs')
plt.xticks(ticks=np.append(0, numbers_of_CPUs))
plt.xlabel('n_jobs')
plt.ylabel('time (seconds)')
_ = plt.ylim(0, 1.1*np.max(times))
n_CPU=4.
clf.best_score_=1.0, clf.best_params_={'C': 536.4894263381026, 'kernel': 'rbf'}
n_jobs=1, clf.fit() took 5.2 seconds.
clf.best_score_=1.0, clf.best_params_={'C': 370.63300024610214, 'kernel': 'rbf'}
n_jobs=2, clf.fit() took 3.95 seconds.
clf.best_score_=1.0, clf.best_params_={'C': 524.7741645779938, 'kernel': 'rbf'}
n_jobs=3, clf.fit() took 3.39 seconds.
clf.best_score_=1.0, clf.best_params_={'C': 585.7727633339724, 'kernel': 'rbf'}
n_jobs=4, clf.fit() took 2.29 seconds.
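One way to summarize these timings is speedup relative to n_jobs=1; in the run above that is about 5.2/2.29, or roughly 2.3x, with 4 CPUs, short of the ideal 4x, presumably because of overhead and parts of the work that do not parallelize:
speedup = times[1] / times[1:] # times[1] is the single-CPU time
print(f'speedup={speedup}')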