In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn import linear_model
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.inspection import permutation_importance

One-hot encoding

In [2]:
df = pd.DataFrame({'color': ['green', 'yellow', 'red', 'green']})
df
Out[2]:
    color
0   green
1  yellow
2     red
3   green
In [3]:
pd.get_dummies(df.color, drop_first=False) # also try drop_first=True
Out[3]:
   green    red  yellow
0   True  False   False
1  False  False    True
2  False   True   False
3   True  False   False
In [4]:
df.join(pd.get_dummies(df.color, drop_first=False))
Out[4]:
    color  green    red  yellow
0   green   True  False   False
1  yellow  False  False    True
2     red  False   True   False
3   green   True  False   False
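
The same encoding can also be done with scikit-learn's OneHotEncoder, which is handy when the encoder must be fit on training data and then reused on new data. A minimal sketch (assuming scikit-learn >= 1.2, where the dense-output argument is named sparse_output):

from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(sparse_output=False)  # return a dense array rather than a sparse matrix
codes = enc.fit_transform(df[['color']])  # one 0/1 column per category
print(enc.categories_)                    # categories found: 'green', 'red', 'yellow'
print(codes)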

Binning

In [5]:
df = pd.DataFrame({'age': [0, 4, 7, 14, 18, 46, 92]})
df
Out[5]:
   age
0    0
1    4
2    7
3   14
4   18
5   46
6   92
In [6]:
pd.cut(df.age, bins=[0, 3, 18, 65, np.inf], right=False, labels=['baby', 'child', 'adult', 'senior'])
Out[6]:
0      baby
1     child
2     child
3     child
4     adult
5     adult
6    senior
Name: age, dtype: category
Categories (4, object): ['baby' < 'child' < 'adult' < 'senior']
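
The binned labels can be kept alongside the original column. As a sketch (the column name age_group is just an illustration):

df['age_group'] = pd.cut(df.age, bins=[0, 3, 18, 65, np.inf], right=False,
                         labels=['baby', 'child', 'adult', 'senior'])
df['age_group'].cat.codes   # ordinal codes 0-3, useful when a numeric feature is needed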

Rescaling

Min-max normalization

In [7]:
X = np.array([[0, 100], [1, 101], [2, 102], [3, 103]])
X
Out[7]:
array([[  0, 100],
       [  1, 101],
       [  2, 102],
       [  3, 103]])
In [8]:
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X) # do scaling
X_scaled
Out[8]:
array([[0.        , 0.        ],
       [0.33333333, 0.33333333],
       [0.66666667, 0.66666667],
       [1.        , 1.        ]])
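
A quick by-hand check of the min-max formula x' = (x - min) / (max - min) on the first column:

x = X[:, 0]
print((x - x.min()) / (x.max() - x.min()))   # [0.         0.33333333 0.66666667 1.        ]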
In [9]:
scaler.inverse_transform(X_scaled) # undo scaling
Out[9]:
array([[  0., 100.],
       [  1., 101.],
       [  2., 102.],
       [  3., 103.]])

Standardization

In [10]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X) # do scaling
X_scaled
Out[10]:
array([[-1.34164079, -1.34164079],
       [-0.4472136 , -0.4472136 ],
       [ 0.4472136 ,  0.4472136 ],
       [ 1.34164079,  1.34164079]])
In [11]:
# check that it is doing what we expect on first column:
x = X[:, 0]
mu = np.mean(x)
sigma = np.std(x, ddof=0) # ddof=0 (population SD) matches StandardScaler; ddof=1 would give the usual sample SD
print(f'For x={x}, mu={mu:.3}, sigma={sigma:.3}, z={(x - mu) / sigma}')
For x=[0 1 2 3], mu=1.5, sigma=1.12, z=[-1.34164079 -0.4472136   0.4472136   1.34164079]
In [12]:
scaler.inverse_transform(X_scaled) # undo scaling
Out[12]:
array([[  0., 100.],
       [  1., 101.],
       [  2., 102.],
       [  3., 103.]])
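
One caveat worth keeping in mind: with a real train/test split, the scaler should be fit on the training rows only and then reused on the test rows, so that test information does not leak into the scaling. A minimal sketch (the 3/1 split here is only for illustration):

X_train, X_test = X[:3], X[3:]
scaler = StandardScaler().fit(X_train)     # learn mu and sigma from training rows only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)   # reuse the training mu and sigma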

Data imputation

In [13]:
df = pd.DataFrame({'color': ['green', None, 'red', 'green'],
                  'n': [0, 1, np.nan, 3]})
df
Out[13]:
   color    n
0  green  0.0
1   None  1.0
2    red  NaN
3  green  3.0

Replace missing value with mean

In [14]:
imp = SimpleImputer(missing_values=np.nan, strategy='mean', fill_value=None)
imp.fit_transform(df[['n']])
Out[14]:
array([[0.        ],
       [1.        ],
       [1.33333333],
       [3.        ]])
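
The fitted imputer stores the value it used for filling:

print(imp.statistics_)   # [1.33333333], the mean of the non-missing entries 0, 1, 3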

Replace missing value with a value outside the feature's normal range

In [15]:
imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=-1)
imp.fit_transform(df[['n']])
Out[15]:
array([[ 0.],
       [ 1.],
       [-1.],
       [ 3.]])

Replace missing string value with most frequent string

In [16]:
imp = SimpleImputer(missing_values=None, strategy='most_frequent', fill_value=None)
imp.fit_transform(df[['color']])
Out[16]:
array([['green'],
       ['green'],
       ['red'],
       ['green']], dtype=object)
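
To fill both columns in one step, the two imputers above can be combined with a ColumnTransformer; a sketch of that idea:

from sklearn.compose import ColumnTransformer

ct = ColumnTransformer([
    ('num', SimpleImputer(missing_values=np.nan, strategy='mean'), ['n']),
    ('cat', SimpleImputer(missing_values=None, strategy='most_frequent'), ['color']),
])
ct.fit_transform(df)   # one array with both columns filled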

Feature selection

In [17]:
df = pd.read_csv('http://www.stat.wisc.edu/~jgillett/451/data/mtcars.csv', index_col=0)
features = ['cyl', 'disp', 'hp', 'drat', 'wt', 'qsec', 'vs', 'am', 'gear', 'carb']
X = df[features]
y = df['mpg']
selector = SelectKBest(score_func=f_regression, k=3)
print(df.head(n=3)) # print a few rows so the selected columns below can be identified by their values
selector.fit_transform(X=X, y=y)[0:3, :]
                mpg  cyl   disp   hp  drat     wt   qsec  vs  am  gear  carb
Mazda RX4      21.0    6  160.0  110  3.90  2.620  16.46   0   1     4     4
Mazda RX4 Wag  21.0    6  160.0  110  3.90  2.875  17.02   0   1     4     4
Datsun 710     22.8    4  108.0   93  3.85  2.320  18.61   1   1     4     1
Out[17]:
array([[  6.   , 160.   ,   2.62 ],
       [  6.   , 160.   ,   2.875],
       [  4.   , 108.   ,   2.32 ]])
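
Rather than matching values by eye, the selected features can be read off directly:

mask = selector.get_support()      # boolean mask over the 10 candidate features
print(np.array(features)[mask])    # ['cyl' 'disp' 'wt'], matching the columns above
print(selector.scores_.round(1))   # the F-statistic computed for each feature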

Permutation feature importance

In [18]:
model = linear_model.LinearRegression()
model.fit(X, y)
pi = permutation_importance(estimator=model, X=X, y=y, random_state=0)
print(f'pi.importances_mean={pi.importances_mean}')
print(f'pi.importances_std={pi.importances_std}')
print(f'pi.importances={pi.importances}')
plt.bar(x=range(X.columns.size), height=pi.importances_mean, tick_label=X.columns)
plt.title('Feature importance for LinearRegression, mpg vs. rest of mtcars')
plt.xlabel('feature name')
_ = plt.ylabel(r'reduction in $R^2$ on shuffling feature')
pi.importances_mean=[2.59929903e-04 1.76455476e-01 1.07451081e-01 6.56970492e-03
 7.07959680e-01 1.19250118e-01 9.52920689e-04 7.59457144e-02
 8.48002330e-03 2.58485832e-03]
pi.importances_std=[0.00401171 0.02565667 0.05468663 0.00740576 0.13123223 0.04455665
 0.00227627 0.02219275 0.0051608  0.00341558]
pi.importances=[[ 3.60492395e-03  3.41852282e-03  2.14590057e-03 -7.14063846e-03
  -7.29059367e-04]
 [ 1.41178568e-01  1.54926676e-01  1.77004503e-01  2.04006515e-01
   2.05161118e-01]
 [ 1.51861609e-01  1.48277499e-01  1.00108834e-01  4.36971696e-03
   1.32637744e-01]
 [-1.33511669e-03  1.91533101e-02  6.50864416e-03 -3.61246412e-04
   8.88293343e-03]
 [ 6.84811926e-01  9.18968854e-01  6.64560503e-01  5.15873135e-01
   7.55583984e-01]
 [ 1.66731466e-01  8.58062926e-02  1.05975233e-01  6.24020275e-02
   1.75335571e-01]
 [ 3.79521464e-03  1.70231393e-03  7.28798885e-04 -3.13359584e-03
   1.67187183e-03]
 [ 7.04771912e-02  9.77595390e-02  9.79063498e-02  3.75456564e-02
   7.60398356e-02]
 [ 6.29171057e-03  1.15936287e-02  1.70443181e-02  4.13079650e-03
   3.33966258e-03]
 [ 5.51847633e-03  4.14360560e-03  5.67121869e-03 -3.33181717e-03
   9.22808152e-04]]
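
To read the bar chart numerically, the features can be sorted by mean importance; from the output above, wt dominates, followed by disp and qsec:

order = np.argsort(pi.importances_mean)[::-1]
for i in order:
    print(f'{X.columns[i]:>5}: {pi.importances_mean[i]:.3f}')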
In [19]:
model = DecisionTreeRegressor()
model.fit(X, y)
pi = permutation_importance(estimator=model, X=X, y=y, random_state=0)
plt.bar(x=range(X.columns.size), height=pi.importances_mean, tick_label=X.columns)
plt.title('Feature importance for DecisionTreeRegressor, mpg vs. rest of mtcars')
plt.xlabel('feature name')
_ = plt.ylabel(r'reduction in $R^2$ on shuffling feature')
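
For comparison, a fitted tree also exposes its own impurity-based importances, which measure something different from permutation importance:

print(dict(zip(X.columns, model.feature_importances_.round(3))))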

It would be better to measure importance on held-out validation data: coming soon.
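
In the meantime, a minimal sketch of that idea (the split sizes and random_state are illustrative assumptions):

from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=0)
model = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
pi_val = permutation_importance(estimator=model, X=X_val, y=y_val, random_state=0)
print(pi_val.importances_mean)   # importance measured on rows the model did not see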