In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model

Check logistic curve:¶

In [2]:
low = -4
high = 4
xplot = np.linspace(start=low, stop=high)

# One curve per (w, b) pair; the dot marks the midpoint (-b/w, 1/2),
# where the exponent w*x + b is 0 and y = 1/2.
for w, b, color in [(1, 0, 'black'), (1, 1, 'red'), (2, 0, 'green')]:
    yplot = 1 / (1 + np.exp(-(w * xplot + b)))
    plt.plot(xplot, yplot, '-', color=color,
             label=f'$y = \\frac{{1}}{{1 + e^{{-(w x + b)}}}}$ for w={w}, b={b}')
    plt.plot(-b / w, 1/2, '.', color=color)

plt.title('Logistic/sigmoid function')
plt.legend()
plt.show(block=False)
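Each dot marks the curve's midpoint: at x = -b/w the exponent is 0, so y = 1/2. The slope there is w/4, since dy/dx = w y (1 - y). A quick numerical sketch of that fact (not part of the original notebook):

# Sketch: the sigmoid's slope at its midpoint x = -b/w is w/4,
# because dy/dx = w * y * (1 - y) and y = 1/2 at the midpoint.
w, b, eps = 2, 0, 1e-6
def sigmoid(x):
    return 1 / (1 + np.exp(-(w * x + b)))
mid = -b / w
slope = (sigmoid(mid + eps) - sigmoid(mid - eps)) / (2 * eps)
print(slope, w / 4)  # both approximately 0.5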

Toy lecture example:¶

In [3]:
X = np.array([-1, 0, 0, 1]).reshape(-1, 1) # need 2D for .fit() below
y = np.array([0, 0, 1, 1])
N = y.shape[0]
model = linear_model.LogisticRegression(C=1000)
# First try commenting out next three lines and setting b and w by eye.
# Also try varying C, above.
model.fit(X, y)
b = model.intercept_
w = model.coef_[0]
print(f'intercept={b}, slope={w}, training score={model.score(X, y)}')
print(f'predictions for X={X} and y={y} are y_hat={model.predict(X)}')

# plot data
plt.plot(X, y, 'o', color='black', label=r'data $\{(x_i, y_i)\}$')
plt.title('Toy logistic regression for (-1, 0), (0, 0), (0, 1), (1, 1)')
plt.xlabel('x')
plt.xlim(low, high)
margin = 0.1
plt.ylim(0 - margin, 1 + margin)

# plot curve
xplot = np.linspace(start=low, stop=high)
yplot = 1 / (1 + np.exp(-(w * xplot + b)))
plt.plot(xplot, yplot, label=r'logistic curve $\hat{P}(y = 1)$')

# find and plot sample proportions
x_values, x_counts = np.unique(X, return_counts=True)
n_x_values = x_values.shape[0]
success_proportion_per_x_value = np.zeros(n_x_values)
for i in np.arange(n_x_values):
    success_proportion_per_x_value[i] = np.sum(y[X[:, 0] == x_values[i]]) / x_counts[i]

probs = model.predict_proba(X)[:, 1] # column 1 is P(y_i = 1); column 0 is P(y_i = 0)
print('probs:')
print(probs)
plt.plot(x_values, success_proportion_per_x_value, '.', color='red',
         label='sample proportions')

plt.legend()
plt.savefig('toyLogistic.png')
plt.show(block=False)
intercept=[1.61360353e-05], slope=[5.83190071], training score=0.75
predictions for X=[[-1]
 [ 0]
 [ 0]
 [ 1]] and y=[0 0 1 1] are y_hat=[0 1 1 1]
probs:
[0.00292397 0.50000403 0.50000403 0.99707612]
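The probabilities above can be recomputed by hand from the fitted sigmoid; a quick sketch:

# Sketch: predict_proba's column 1 should match the sigmoid at each x_i.
manual_probs = 1 / (1 + np.exp(-(w * X[:, 0] + b)))
print(manual_probs)  # should match probs printed above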

1D x real data example¶

The data give proportions of girls at various ages who have reached menarche (onset of menstruation).

In [4]:
df_raw = pd.read_csv('http://www.stat.wisc.edu/~jgillett/451/data/menarche.csv')
df_raw
# The first row says "0 out of 376 girls with average age 9.21 have
# reached menarche." The tenth row says "29 out of 93 girls with
# average age 12.33 have reached menarche." The last row says "1049
# out of 1049 girls with average age 17.58 have reached menarche."
Out[4]:
Age Total Menarche
0 9.21 376 0
1 10.21 200 0
2 10.58 93 0
3 10.83 120 2
4 11.08 90 2
5 11.33 88 5
6 11.58 105 10
7 11.83 111 17
8 12.08 100 16
9 12.33 93 29
10 12.58 100 39
11 12.83 108 51
12 13.08 99 47
13 13.33 106 67
14 13.58 105 81
15 13.83 117 88
16 14.08 98 79
17 14.33 97 90
18 14.58 120 113
19 14.83 102 95
20 15.08 122 117
21 15.33 111 107
22 15.58 94 92
23 15.83 114 112
24 17.58 1049 1049
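Each row's observed proportion is Menarche / Total; a one-line sketch of the quantity the logistic curve should track:

# Sketch: observed proportion who have reached menarche in each age group.
print((df_raw.Menarche / df_raw.Total).round(3))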
In [5]:
# I made a second data file called menarche_cases.csv from
# menarche.csv that gives one line for each girl in the study
# indicating her age and whether (1) or not (0) she has reached
# menarche. E.g., for the tenth row of menarche.csv, I made 29 rows
# "12.33,1" and 64 = 93 - 29 rows "12.33,0" in menarche_cases.csv.
df = pd.read_csv('http://www.stat.wisc.edu/~jgillett/451/data/menarche_cases.csv')
df
Out[5]:
age reached_menarche
0 9.21 0
1 9.21 0
2 9.21 0
3 9.21 0
4 9.21 0
... ... ...
3913 17.58 1
3914 17.58 1
3915 17.58 1
3916 17.58 1
3917 17.58 1

3918 rows × 2 columns
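For reference, here is one way such a cases table could be rebuilt from df_raw (a sketch, with df_cases a hypothetical name; the author's menarche_cases.csv was prepared separately):

# Sketch: expand each grouped row into Total per-girl rows,
# the first Menarche of them 1s and the rest 0s.
ages = np.repeat(df_raw.Age, df_raw.Total)
reached = np.concatenate([np.r_[np.ones(m, dtype=int), np.zeros(t - m, dtype=int)]
                          for m, t in zip(df_raw.Menarche, df_raw.Total)])
df_cases = pd.DataFrame({'age': ages.to_numpy(), 'reached_menarche': reached})
print(df_cases.shape)  # (3918, 2)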

In [6]:
X = df[['age']]
y = df.reached_menarche
model = linear_model.LogisticRegression()
model.fit(X, y)
b = model.intercept_
w = model.coef_[0]
print(f'intercept={b}, slope={w}, training score={model.score(X, y)}')
intercept=[-21.15317965], slope=[1.62634868], training score=0.9060745278203165
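The fitted curve's midpoint x = -b/w is the age at which the estimated probability is 1/2, about 13 years here:

# Midpoint of the fitted curve: estimated median age at menarche.
print(f'age at which estimated P(reached menarche) = 1/2: {-b[0] / w[0]:.2f}')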
In [7]:
# plot data
low = 8
high = 20
plt.plot(df.age, y, '.', color='black', label='data (many duplicates)')
plt.xlim(low, high)
margin = 0.1
plt.ylim(0 - margin, 1 + margin)
plt.title('Proportions of girls who have reached menarche')
plt.xlabel('age')
plt.ylabel('proportion')

# plot curve
xplot = np.linspace(start=low, stop=high)
yplot = 1 / (1 + np.exp(-(w * xplot + b)))
plt.plot(xplot, yplot, label='logistic curve')

# find and plot sample proportions
x_values, x_counts = np.unique(X, return_counts=True)
n_x_values = x_values.shape[0]
success_proportion_per_x_value = np.zeros(n_x_values)
for i in np.arange(n_x_values):
    success_proportion_per_x_value[i] = np.sum(y[df.age == x_values[i]]) / x_counts[i]

probs = model.predict_proba(X)[:, 1] # column 1 is P(y_i = 1); column 0 is P(y_i = 0)
plt.plot(x_values, success_proportion_per_x_value, '.', color='red',
         label='sample proportions')

plt.legend(loc='center right')
plt.show(block=False)
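A numeric companion to the plot (a sketch): evaluate the fitted curve at each distinct age and compare with the sample proportions.

# Sketch: fitted probability vs. observed proportion at each distinct age.
fitted = 1 / (1 + np.exp(-(w[0] * x_values + b[0])))
print(pd.DataFrame({'age': x_values,
                    'sample_proportion': success_proportion_per_x_value.round(3),
                    'fitted_probability': fitted.round(3)}))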

2D x example to show a linear decision boundary¶

In [8]:
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
X = X[:, 2:4] # retain only petal length and petal width
X = X[y < 2, :] # exclude y == 2 flowers to get binary y
y = y[y < 2]
model = linear_model.LogisticRegression()
model.fit(X, y)
b = model.intercept_
w = model.coef_[0]
plt.plot(X[y == 0, 0], X[y == 0, 1], '.', color='black')
plt.plot(X[y == 1, 0], X[y == 1, 1], '.', color='red')
plt.xlim(0, 6)
plt.ylim(0, 3)
plt.xlabel('petal length (cm)')
plt.ylabel('petal width (cm)')
x_plot = np.linspace(0, 6)
# Decision boundary: wx + b = ln(p / (1 - p)) = 0 at p = 1/2;
# solve w[0]*x_1 + w[1]*x_2 + b = 0 for x_2 (the y-coordinate).
plt.plot(x_plot, -(w[0]*x_plot + b) / w[1])
Out[8]:
[<matplotlib.lines.Line2D at 0x7fd1d0d8ce20>]
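Each point is classified by which side of the line wx + b = 0 it falls on; a quick sketch to confirm that the predictions agree with the boundary side:

# Sketch: class predictions should match the sign of the decision function.
side_of_boundary = (X @ w + b > 0).astype(int)
print(np.all(side_of_boundary == model.predict(X)))  # expect True
print(f'training score: {model.score(X, y)}')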