import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
low = -4
high = 4
xplot = np.linspace(start=low, stop=high)
w = 1
b = 0
yplot = 1 / (1 + np.exp(-(w * xplot + b)))
plt.plot(xplot, yplot, '-', color='black',
label=f'$y = \\frac{{1}}{{1 + e^{{-(w x + b)}}}}$ for w={w}, b={b}')
plt.plot(-b / w, 1/2, '.', color='black')
plt.title('Logistic/sigmoid function')
w = 1
b = 1
yplot = 1 / (1 + np.exp(-(w * xplot + b)))
plt.plot(xplot, yplot, '-', color='red',
label=f'$y = \\frac{{1}}{{1 + e^{{-(w x + b)}}}}$ for w={w}, b={b}')
plt.plot(-b / w, 1/2, '.', color='red')
w = 2
b = 0
yplot = 1 / (1 + np.exp(-(w * xplot + b)))
plt.plot(xplot, yplot, '-', color='green',
label=f'$y = \\frac{{1}}{{1 + e^{{-(w x + b)}}}}$ for w={w}, b={b}')
plt.plot(-b / w, 1/2, '.', color='green')
plt.legend()
plt.show(block=False)
X = np.array([-1, 0, 0, 1]).reshape(-1, 1) # need 2D for .fit() below
y = np.array([0, 0, 1, 1])
N = y.shape[0]
model = linear_model.LogisticRegression(C=1000)
# First try commenting out next three lines and setting b and w by eye.
# Also try varying C, above.
model.fit(X, y)
b = model.intercept_
w = model.coef_[0]
print(f'intercept={b}, slope={w}, training score={model.score(X, y)}')
print(f'predictions for X={X} and y={y} are y_hat={model.predict(X)}')
# plot data
plt.plot(X, y, 'o', color='black', label=r'data $\{(x_i, y_i)\}$')
plt.title('Toy logistic regression for (-1, 0), (0, 0), (0, 1), (1, 1)')
plt.xlabel('x')
plt.xlim(low, high)
margin = 0.1
plt.ylim(-(0 + margin), 1 + margin)
# plot curve
xplot = np.linspace(start=low, stop=high)
yplot = 1 / (1 + np.exp(-(w * xplot + b)))
plt.plot(xplot, yplot, label=r'logistic curve $\hat{P}(y = 1)$')
# find and plot sample proportions
x_values, x_counts = np.unique(X, return_counts=True)
n_x_values = x_values.shape[0]
success_proportion_per_x_value = np.zeros(n_x_values)
for i in np.arange(n_x_values):
success_proportion_per_x_value[i] = np.sum(y[X[:, 0] == x_values[i]]) / x_counts[i]
probs = model.predict_proba(X)[:, 1] # column 1 is P(y_i = 1); column 0 is P(y_i = 0)
print('probs:')
print(probs)
plt.plot(x_values, success_proportion_per_x_value, '.', color='red',
label='sample proportions')
plt.legend()
plt.savefig('toyLogistic.png')
plt.show(block=False)
intercept=[1.61360353e-05], slope=[5.83190071], training score=0.75 predictions for X=[[-1] [ 0] [ 0] [ 1]] and y=[0 0 1 1] are y_hat=[0 1 1 1] probs: [0.00292397 0.50000403 0.50000403 0.99707612]
on proportions of girls at various ages who have reached menarche (onset of menstruation).
df_raw = pd.read_csv('http://www.stat.wisc.edu/~jgillett/451/data/menarche.csv')
df_raw
# The first row says "0 out of 376 girls with average age 9.21 have
# reached menarche." The tenth row says "29 out of 93 girls with
# average age 12.33 have reached menarche." The last row says "1049
# out of 1049 girls with average age 17.58 have reached menarche."
Age | Total | Menarche | |
---|---|---|---|
0 | 9.21 | 376 | 0 |
1 | 10.21 | 200 | 0 |
2 | 10.58 | 93 | 0 |
3 | 10.83 | 120 | 2 |
4 | 11.08 | 90 | 2 |
5 | 11.33 | 88 | 5 |
6 | 11.58 | 105 | 10 |
7 | 11.83 | 111 | 17 |
8 | 12.08 | 100 | 16 |
9 | 12.33 | 93 | 29 |
10 | 12.58 | 100 | 39 |
11 | 12.83 | 108 | 51 |
12 | 13.08 | 99 | 47 |
13 | 13.33 | 106 | 67 |
14 | 13.58 | 105 | 81 |
15 | 13.83 | 117 | 88 |
16 | 14.08 | 98 | 79 |
17 | 14.33 | 97 | 90 |
18 | 14.58 | 120 | 113 |
19 | 14.83 | 102 | 95 |
20 | 15.08 | 122 | 117 |
21 | 15.33 | 111 | 107 |
22 | 15.58 | 94 | 92 |
23 | 15.83 | 114 | 112 |
24 | 17.58 | 1049 | 1049 |
# I made a second data file called menarche_cases.csv from
# menarche.csv that gives one line for each girl in the study
# indicating her age and whether (1) or not (0) she has reached
# menarche. e.g. For the tenth row of menarche.csv, I made 29 rows
# "12.33,1" and 64=93-29 rows "12.33,0" in menarche_cases.csv.
df = pd.read_csv('http://www.stat.wisc.edu/~jgillett/451/data/menarche_cases.csv')
df
age | reached_menarche | |
---|---|---|
0 | 9.21 | 0 |
1 | 9.21 | 0 |
2 | 9.21 | 0 |
3 | 9.21 | 0 |
4 | 9.21 | 0 |
... | ... | ... |
3913 | 17.58 | 1 |
3914 | 17.58 | 1 |
3915 | 17.58 | 1 |
3916 | 17.58 | 1 |
3917 | 17.58 | 1 |
3918 rows × 2 columns
X = df[['age']]
y = df.reached_menarche
model = linear_model.LogisticRegression()
model.fit(X, y)
b = model.intercept_
w = model.coef_[0]
print(f'intercept={b}, slope={w}, training score={model.score(X, y)}')
intercept=[-21.15317965], slope=[1.62634868], training score=0.9060745278203165
# plot data
low = 8
high = 20
plt.plot(df.age, y, '.', color='black', label='data (many duplicates)')
plt.xlim(low, high)
margin = 0.1
plt.ylim(0 - margin, 1 + margin)
plt.title('Proportions of girls who have reached menarche')
plt.xlabel('age')
plt.ylabel('proportion')
# plot curve
xplot = np.linspace(start=low, stop=high)
yplot = 1 / (1 + np.exp(-(w * xplot + b)))
plt.plot(xplot, yplot, label='logistic curve')
# find and plot sample proportions
x_values, x_counts = np.unique(X, return_counts=True)
n_x_values = x_values.shape[0]
success_proportion_per_x_value = np.zeros(n_x_values)
for i in np.arange(n_x_values):
success_proportion_per_x_value[i] = np.sum(y[df.age == x_values[i]]) / x_counts[i]
probs = model.predict_proba(X)[:, 0] # column 1 is P(y_i = 1); column 0 is P(y_i = 0)
plt.plot(x_values, success_proportion_per_x_value, '.', color='red',
label='sample proportions')
plt.legend(loc='center right')
plt.show(block=False)
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
X = X[:, 2:4] # retain only petal length and petal width
X = X[y < 2, :] # exclude y == 2 flowers to get binary y
y = y[y < 2]
model = linear_model.LogisticRegression()
model.fit(X, y)
b = model.intercept_
w = model.coef_[0]
plt.plot(X[y == 0, 0], X[y == 0, 1], '.', color='black')
plt.plot(X[y == 1, 0], X[y == 1, 1], '.', color='red')
plt.xlim(0, 6)
plt.ylim(0, 3)
x_plot = np.linspace(0, 6)
plt.plot(x_plot, -(w[0]*x_plot + b) / w[1]) # Solve for x_2 (y-coordinate) wx + b = 0 = ln(p / (1 - p)) where p = 1/2.
[<matplotlib.lines.Line2D at 0x7fd1d0d8ce20>]