# NFL data
from https://pages.stat.wisc.edu/~jgillett/451/project/presentations/18.pdf

[field descriptions](https://www.nflfastr.com/articles/field_descriptions.html)

In [None]:
import os
import urllib
import urllib.error

import matplotlib.pyplot as plt
import nfl_data_py as nfl
import numpy as np
import pandas as pd
from sklearn import linear_model

In [None]:
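# Uncomment the two lines below to install nfl_data_py into the Python
# environment this notebook's kernel is running in.
#import sys
#!{sys.executable} -m pip install nfl_data_py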

In [None]:
# Load the field-goal table (columns kick_distance, field_goal_result).
# First try the CSV at `url`; if that fails, build the table from
# play-by-play data fetched through nfl_data_py.
df = None
# NOTE(review): the '_BUG_' in the file name looks like a deliberate way to
# force the fallback path below -- confirm before "fixing" the URL.
url = 'http://www.stat.wisc.edu/~jgillett/451/data/_BUG_field_goals.csv'
try:
    df = pd.read_csv(url)
except urllib.error.URLError:
    # HTTPError (e.g. a 404) is a subclass of URLError, so it is caught here too.
    print(f'url={url} is not found')
else:
    print(f'loaded data from url={url}')

if df is None:
    # Seasons 2010 through 2021 inclusive (the range stop is exclusive).
    # Plain Python ints are used rather than NumPy integer scalars.
    years = list(range(2010, 2022))
    data = nfl.import_pbp_data(years=years, downcast=True, cache=False, alt_path=None)
    print(f'data.shape={data.shape}')
    print(data.head())
    # Keep only field-goal attempts and the two columns used by the model.
    df = data.loc[data.field_goal_attempt == 1, ['kick_distance', 'field_goal_result']]
    print('loaded data from nfl.import_pbp_data()')
    # Write the extract to field_goals.csv in the working directory.
    df.to_csv(path_or_buf='field_goals.csv', index=False, float_format='%.3f')

display(df.head())

In [None]:
# Fit a one-feature logistic regression for P(made) as a function of kick
# distance. C is scikit-learn's inverse regularization strength, so C=1000
# applies only a weak penalty.
model = linear_model.LogisticRegression(C=1000)
X = df[['kick_distance']]
y = (df.field_goal_result == 'made')
model.fit(X, y)
b = model.intercept_
w = model.coef_[0]
print(f'intercept={b}, slope={w}, training score={model.score(X, y)}')
#print(f'predictions for X={X} and y={y} are y_hat={model.predict(X)}')

# Empirical proportion of kicks made at each whole-yard distance 0..99,
# indexed by distance in yards. Distances with no attempts stay NaN so they
# are not confused with "every kick missed"; Matplotlib skips NaN points.
sample_proportion_made = np.full(100, np.nan)
min_distance = int(np.min(df.kick_distance))
max_distance = int(np.max(df.kick_distance))
for i in range(100):
    made_at_i = y[df.kick_distance == i]
    if len(made_at_i) > 0:
        # Mean of a boolean Series is the fraction of True values.
        sample_proportion_made[i] = made_at_i.mean()

# Raw outcomes: one translucent dot per kick at y=0 (missed) or y=1 (made).
plt.plot(X.kick_distance, y.astype(int), 'o', color='black', alpha=.1, label=r'(x=distance, y=made)')

# Per-distance proportions over the observed range. The x values and the
# slice use the same integer bounds so their lengths always agree.
plt.plot(np.arange(min_distance, max_distance + 1),
         sample_proportion_made[min_distance:(max_distance + 1)],
         'o', label=r'(x=distance, y=proportion)')
plt.xlim((0, 100))
plt.xlabel('kick distance (yards)')
plt.ylabel('proportion made')
# The kick count comes from the data instead of being hard-coded.
# NOTE(review): the fallback loader requests seasons 2010 through 2021;
# confirm that the '2010-2022' label matches the data actually loaded.
plt.title(f'{len(df):,} NFL Field Goals, 2010-2022')

# Fitted logistic curve evaluated on an even grid over the plotted x-range.
xplot = np.linspace(start=0, stop=100)
yplot = 1 / (1 + np.exp(-(w * xplot + b)))
plt.plot(xplot, yplot, label=r'logistic curve $\hat{P}(y = 1)$')
plt.legend(loc='center left')
plt.savefig(fname='NFL_field_goal.png')

In [None]:
# (number of rows, number of columns) of the field-goal table.
df.shape

In [None]:
# Distances of the kicks that were made, in ascending order.
made_mask = df.field_goal_result == 'made'
df.loc[made_mask, 'kick_distance'].sort_values()

In [None]:
# Show the per-yard made proportion for distances 60 through 69.
proportion_table = pd.DataFrame({'made': sample_proportion_made})
display(proportion_table.iloc[60:70])

In [None]:
# Attempts longer than 62 yards, in ascending order. The column is sorted
# once and the sorted result is reused for both the mask and the selection.
sorted_distances = df.kick_distance.sort_values()
sorted_distances[sorted_distances > 62]

In [None]:
# Outcomes of every attempt taken from exactly 66 yards.
at_66_yards = df.kick_distance == 66
df.loc[at_66_yards, 'field_goal_result']