In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model

Check logistic curve:¶

In [2]:
low = -4
high = 4
xplot = np.linspace(start=low, stop=high)

# One curve per (w, b) pair; the dot marks the midpoint (-b/w, 1/2),
# where the exponent w*x + b is 0 and y = 1/2.
for w, b, color in [(1, 0, 'black'), (1, 1, 'red'), (2, 0, 'green')]:
    yplot = 1 / (1 + np.exp(-(w * xplot + b)))
    plt.plot(xplot, yplot, '-', color=color,
             label=f'$y = \\frac{{1}}{{1 + e^{{-(w x + b)}}}}$ for w={w}, b={b}')
    plt.plot(-b / w, 1/2, '.', color=color)

plt.title('Logistic/sigmoid function')
plt.legend()
plt.show(block=False)
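Each dot marks the curve's midpoint: at x = -b/w the exponent is 0, so y = 1/2. The slope there is w/4, since dy/dx = w y (1 - y). A quick numerical sketch of that fact (not part of the original notebook):

# Sketch: the sigmoid's slope at its midpoint x = -b/w is w/4,
# because dy/dx = w * y * (1 - y) and y = 1/2 at the midpoint.
w, b, eps = 2, 0, 1e-6
def sigmoid(x):
    return 1 / (1 + np.exp(-(w * x + b)))
mid = -b / w
slope = (sigmoid(mid + eps) - sigmoid(mid - eps)) / (2 * eps)
print(slope, w / 4)  # both approximately 0.5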

Toy lecture example:¶

In [3]:
X = np.array([-1, 0, 0, 1]).reshape(-1, 1) # need 2D for .fit() below
y = np.array([0, 0, 1, 1])
N = y.shape[0]
model = linear_model.LogisticRegression(C=1000)
# First try commenting out next three lines and setting b and w by eye.
# Also try varying C, above.
model.fit(X, y)
b = model.intercept_
w = model.coef_[0]
print(f'intercept={b}, slope={w}, training score={model.score(X, y)}')
print(f'predictions for X={X} and y={y} are y_hat={model.predict(X)}')

# plot data
plt.plot(X, y, 'o', color='black', label=r'data $\{(x_i, y_i)\}$')
plt.title('Toy logistic regression for (-1, 0), (0, 0), (0, 1), (1, 1)')
plt.xlabel('x')
plt.xlim(low, high)
margin = 0.1
plt.ylim(0 - margin, 1 + margin)

# plot curve
xplot = np.linspace(start=low, stop=high)
yplot = 1 / (1 + np.exp(-(w * xplot + b)))
plt.plot(xplot, yplot, label=r'logistic curve $\hat{P}(y = 1)$')

# find and plot sample proportions
x_values, x_counts = np.unique(X, return_counts=True)
n_x_values = x_values.shape[0]
success_proportion_per_x_value = np.zeros(n_x_values)
for i in np.arange(n_x_values):
    success_proportion_per_x_value[i] = np.sum(y[X[:, 0] == x_values[i]]) / x_counts[i]

probs = model.predict_proba(X)[:, 1] # column 1 is P(y_i = 1); column 0 is P(y_i = 0)
print('probs:')
print(probs)
plt.plot(x_values, success_proportion_per_x_value, '.', color='red',
         label='sample proportions')

plt.legend()
plt.savefig('toyLogistic.png')
plt.show(block=False)
intercept=[1.61360353e-05], slope=[5.83190071], training score=0.75
predictions for X=[[-1]
 [ 0]
 [ 0]
 [ 1]] and y=[0 0 1 1] are y_hat=[0 1 1 1]
probs:
[0.00292397 0.50000403 0.50000403 0.99707612]
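The probabilities above can be recomputed by hand from the fitted sigmoid; a quick sketch:

# Sketch: predict_proba's column 1 should match the sigmoid at each x_i.
manual_probs = 1 / (1 + np.exp(-(w * X[:, 0] + b)))
print(manual_probs)  # should match probs printed above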

1D x real data example¶

The data give proportions of girls at various ages who have reached menarche (onset of menstruation).

In [4]:
df_raw = pd.read_csv('http://www.stat.wisc.edu/~jgillett/451/data/menarche.csv')
df_raw
# The first row says "0 out of 376 girls with average age 9.21 have
# reached menarche." The tenth row says "29 out of 93 girls with
# average age 12.33 have reached menarche." The last row says "1049
# out of 1049 girls with average age 17.58 have reached menarche."
Out[4]:
Age Total Menarche
0 9.21 376 0
1 10.21 200 0
2 10.58 93 0
3 10.83 120 2
4 11.08 90 2
5 11.33 88 5
6 11.58 105 10
7 11.83 111 17
8 12.08 100 16
9 12.33 93 29
10 12.58 100 39
11 12.83 108 51
12 13.08 99 47
13 13.33 106 67
14 13.58 105 81
15 13.83 117 88
16 14.08 98 79
17 14.33 97 90
18 14.58 120 113
19 14.83 102 95
20 15.08 122 117
21 15.33 111 107
22 15.58 94 92
23 15.83 114 112
24 17.58 1049 1049
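Each row's observed proportion is Menarche / Total; a one-line sketch of the quantity the logistic curve should track:

# Sketch: observed proportion who have reached menarche in each age group.
print((df_raw.Menarche / df_raw.Total).round(3))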
In [5]:
# I made a second data file called menarche_cases.csv from
# menarche.csv that gives one line for each girl in the study
# indicating her age and whether (1) or not (0) she has reached
# menarche. E.g., for the tenth row of menarche.csv, I made 29 rows
# "12.33,1" and 64 = 93 - 29 rows "12.33,0" in menarche_cases.csv.
df = pd.read_csv('http://www.stat.wisc.edu/~jgillett/451/data/menarche_cases.csv')
df
Out[5]:
age reached_menarche
0 9.21 0
1 9.21 0
2 9.21 0
3 9.21 0
4 9.21 0
... ... ...
3913 17.58 1
3914 17.58 1
3915 17.58 1
3916 17.58 1
3917 17.58 1

3918 rows × 2 columns
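For reference, here is one way such a cases table could be rebuilt from df_raw (a sketch, with df_cases a hypothetical name; the author's menarche_cases.csv was prepared separately):

# Sketch: expand each grouped row into Total per-girl rows,
# the first Menarche of them 1s and the rest 0s.
ages = np.repeat(df_raw.Age, df_raw.Total)
reached = np.concatenate([np.r_[np.ones(m, dtype=int), np.zeros(t - m, dtype=int)]
                          for m, t in zip(df_raw.Menarche, df_raw.Total)])
df_cases = pd.DataFrame({'age': ages.to_numpy(), 'reached_menarche': reached})
print(df_cases.shape)  # (3918, 2)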

In [6]:
X = df[['age']]
y = df.reached_menarche
model = linear_model.LogisticRegression()
model.fit(X, y)
b = model.intercept_
w = model.coef_[0]
print(f'intercept={b}, slope={w}, training score={model.score(X, y)}')
intercept=[-21.15317965], slope=[1.62634868], training score=0.9060745278203165
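The fitted curve's midpoint x = -b/w is the age at which the estimated probability is 1/2, about 13 years here:

# Midpoint of the fitted curve: estimated median age at menarche.
print(f'age at which estimated P(reached menarche) = 1/2: {-b[0] / w[0]:.2f}')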
In [7]:
# plot data
low = 8
high = 20
plt.plot(df.age, y, '.', color='black', label='data (many duplicates)')
plt.xlim(low, high)
margin = 0.1
plt.ylim(0 - margin, 1 + margin)
plt.title('Proportions of girls who have reached menarche')
plt.xlabel('age')
plt.ylabel('proportion')

# plot curve
xplot = np.linspace(start=low, stop=high)
yplot = 1 / (1 + np.exp(-(w * xplot + b)))
plt.plot(xplot, yplot, label='logistic curve')

# find and plot sample proportions
x_values, x_counts = np.unique(X, return_counts=True)
n_x_values = x_values.shape[0]
success_proportion_per_x_value = np.zeros(n_x_values)
for i in np.arange(n_x_values):
    success_proportion_per_x_value[i] = np.sum(y[df.age == x_values[i]]) / x_counts[i]

probs = model.predict_proba(X)[:, 1] # column 1 is P(y_i = 1); column 0 is P(y_i = 0)
plt.plot(x_values, success_proportion_per_x_value, '.', color='red',
         label='sample proportions')

plt.legend(loc='center right')
plt.show(block=False)
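A numeric companion to the plot (a sketch): evaluate the fitted curve at each distinct age and compare with the sample proportions.

# Sketch: fitted probability vs. observed proportion at each distinct age.
fitted = 1 / (1 + np.exp(-(w[0] * x_values + b[0])))
print(pd.DataFrame({'age': x_values,
                    'sample_proportion': success_proportion_per_x_value.round(3),
                    'fitted_probability': fitted.round(3)}))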

2D x example to show a linear decision boundary¶

In [8]:
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
X = X[:, 2:4] # retain only petal length and petal width
X = X[y < 2, :] # exclude y == 2 flowers to get binary y
y = y[y < 2]
model = linear_model.LogisticRegression()
model.fit(X, y)
b = model.intercept_
w = model.coef_[0]
plt.plot(X[y == 0, 0], X[y == 0, 1], '.', color='black')
plt.plot(X[y == 1, 0], X[y == 1, 1], '.', color='red')
plt.xlim(0, 6)
plt.ylim(0, 3)
plt.xlabel('petal length (cm)')
plt.ylabel('petal width (cm)')
x_plot = np.linspace(0, 6)
# Decision boundary: wx + b = ln(p / (1 - p)) = 0 at p = 1/2;
# solve w[0]*x_1 + w[1]*x_2 + b = 0 for x_2 (the y-coordinate).
plt.plot(x_plot, -(w[0]*x_plot + b) / w[1])
Out[8]:
[<matplotlib.lines.Line2D at 0x7fd1d0d8ce20>]
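Each point is classified by which side of the line wx + b = 0 it falls on; a quick sketch to confirm that the predictions agree with the boundary side:

# Sketch: class predictions should match the sign of the decision function.
side_of_boundary = (X @ w + b > 0).astype(int)
print(np.all(side_of_boundary == model.predict(X)))  # expect True
print(f'training score: {model.score(X, y)}')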