import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
# Toy frame with a single categorical column, used by the one-hot encoding cells.
df = pd.Series(['green', 'yellow', 'red', 'green'], name='color').to_frame()
df
| | color |
---|---|
0 | green |
1 | yellow |
2 | red |
3 | green |
# One-hot encode the colour column: one indicator column per distinct category.
# dtype=int keeps the indicators as 0/1; without it pandas >= 2.0 returns
# bool columns and the cell displays True/False instead.
pd.get_dummies(df.color, drop_first=False, dtype=int) # also try drop_first=True
| | green | red | yellow |
---|---|---|---|
0 | 1 | 0 | 0 |
1 | 0 | 0 | 1 |
2 | 0 | 1 | 0 |
3 | 1 | 0 | 0 |
# Append the indicator columns to the original frame (joined on the row index).
# dtype=int keeps the indicators as 0/1; without it pandas >= 2.0 returns
# bool columns and the joined frame displays True/False instead.
df.join(pd.get_dummies(df.color, drop_first=False, dtype=int))
| | color | green | red | yellow |
---|---|---|---|---|
0 | green | 1 | 0 | 0 |
1 | yellow | 0 | 0 | 1 |
2 | red | 0 | 1 | 0 |
3 | green | 1 | 0 | 0 |
# Toy frame of ages, used by the binning cell.
df = pd.DataFrame([0, 4, 7, 14, 18, 46, 92], columns=['age'])
df
| | age |
---|---|
0 | 0 |
1 | 4 |
2 | 7 |
3 | 14 |
4 | 18 |
5 | 46 |
6 | 92 |
# Discretise ages into named, ordered life stages.
# right=False makes each bin closed on the left and open on the right,
# so age 18 lands in [18, 65) -> 'adult' rather than in 'child'.
pd.cut(
    df['age'],
    bins=[0, 3, 18, 65, np.inf],
    labels=['baby', 'child', 'adult', 'senior'],
    right=False,
)
0 baby 1 child 2 child 3 child 4 adult 5 adult 6 senior Name: age, dtype: category Categories (4, object): ['baby' < 'child' < 'adult' < 'senior']
# Two features on very different scales: 0..3 and 100..103.
X = np.column_stack((np.arange(4), np.arange(100, 104)))
X
array([[ 0, 100], [ 1, 101], [ 2, 102], [ 3, 103]])
# Min-max scaling: learn each column's min and max, then rescale it.
scaler = MinMaxScaler().fit(X)   # learn the per-column range
X_scaled = scaler.transform(X)   # do scaling
X_scaled
array([[0. , 0. ], [0.33333333, 0.33333333], [0.66666667, 0.66666667], [1. , 1. ]])
# Map the scaled values back to the original units with the scaler fitted above.
scaler.inverse_transform(X_scaled) # undo scaling
array([[ 0., 100.], [ 1., 101.], [ 2., 102.], [ 3., 103.]])
# Standardisation: learn each column's mean and standard deviation, then
# convert the values to z-scores.
scaler = StandardScaler().fit(X)   # learn per-column mean and std
X_scaled = scaler.transform(X)     # do scaling
X_scaled
array([[-1.34164079, -1.34164079], [-0.4472136 , -0.4472136 ], [ 0.4472136 , 0.4472136 ], [ 1.34164079, 1.34164079]])
# Sanity check: recompute the z-scores of the first column by hand and
# compare them with the first column of X_scaled.
x = X[:, 0]
mu = x.mean()
# ddof=0 is the population standard deviation;
# ddof=1 would give the usual sample standard deviation.
sigma = x.std(ddof=0)
print(f'For x={x}, mu={mu:.3}, sigma={sigma:.3}, z={(x - mu) / sigma}')
For x=[0 1 2 3], mu=1.5, sigma=1.12, z=[-1.34164079 -0.4472136 0.4472136 1.34164079]
# Map the standardised values back to the original units with the scaler fitted above.
scaler.inverse_transform(X_scaled) # undo scaling
array([[ 0., 100.], [ 1., 101.], [ 2., 102.], [ 3., 103.]])
# Toy frame with one missing value per column, used by the imputation cells.
df = pd.DataFrame(
    {
        'color': ['green', None, 'red', 'green'],  # None marks the missing category
        'n': [0, 1, np.nan, 3],                    # np.nan marks the missing number
    }
)
df
| | color | n |
---|---|---|
0 | green | 0.0 |
1 | None | 1.0 |
2 | red | NaN |
3 | green | 3.0 |
# Fill the missing number with the mean of the observed values (0, 1, 3 -> 1.33...).
imp = SimpleImputer(strategy='mean', missing_values=np.nan, fill_value=None)
imp.fit(df[['n']]).transform(df[['n']])
array([[0. ], [1. ], [1.33333333], [3. ]])
# Fill the missing number with a fixed sentinel value (-1).
imp = SimpleImputer(strategy='constant', fill_value=-1, missing_values=np.nan)
imp.fit(df[['n']]).transform(df[['n']])
array([[ 0.], [ 1.], [-1.], [ 3.]])
# Fill the missing category with the most common observed value ('green').
# The missing marker in this column is None, not np.nan, so it is named explicitly.
imp = SimpleImputer(strategy='most_frequent', missing_values=None, fill_value=None)
imp.fit(df[['color']]).transform(df[['color']])
array([['green'], ['green'], ['red'], ['green']], dtype=object)