import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)
import numpy as np
import pandas as pd
import seaborn as sns
1. Introduction
nfl_elo.csv contains game-by-game Elo ratings (Elo is a closed system where every point gained by one team is a point lost by another) and forecasts back to 1920.
# Load the complete game-by-game Elo table and preview its first rows.
allNFL = pd.read_csv(filepath_or_buffer="nfl_elo.csv")
allNFL.head(n=5)
date | season | neutral | playoff | team1 | team2 | elo1_pre | elo2_pre | elo_prob1 | elo_prob2 | ... | qb2_game_value | qb1_value_post | qb2_value_post | qbelo1_post | qbelo2_post | score1 | score2 | quality | importance | total_rating | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1920-09-26 | 1920 | 0 | NaN | RII | STP | 1503.947 | 1300.000 | 0.824651 | 0.175349 | ... | NaN | NaN | NaN | NaN | NaN | 48 | 0 | NaN | NaN | NaN |
1 | 1920-10-03 | 1920 | 0 | NaN | AKR | WHE | 1503.420 | 1300.000 | 0.824212 | 0.175788 | ... | NaN | NaN | NaN | NaN | NaN | 43 | 0 | NaN | NaN | NaN |
2 | 1920-10-03 | 1920 | 0 | NaN | BFF | WBU | 1478.004 | 1300.000 | 0.802000 | 0.198000 | ... | NaN | NaN | NaN | NaN | NaN | 32 | 6 | NaN | NaN | NaN |
3 | 1920-10-03 | 1920 | 0 | NaN | DAY | COL | 1493.002 | 1504.908 | 0.575819 | 0.424181 | ... | NaN | NaN | NaN | NaN | NaN | 14 | 0 | NaN | NaN | NaN |
4 | 1920-10-03 | 1920 | 0 | NaN | RII | MUN | 1516.108 | 1478.004 | 0.644171 | 0.355829 | ... | NaN | NaN | NaN | NaN | NaN | 45 | 0 | NaN | NaN | NaN |
5 rows × 33 columns
2. Preprocessing
Since we want to make accurate predictions for the future, we only use data starting from the 2017 season. We also remove the neutral, importance and total_rating columns, since more than half of their entries are missing.
# NumPy 2.0 removed the `NaN` alias; import the canonical `nan` under the old
# name so anything that still refers to `NaN` works on both NumPy 1.x and 2.x.
from numpy import nan as NaN

# Keep only games dated 2017-09-07 or later. The dates are ISO "YYYY-MM-DD"
# strings, which sort chronologically, so a plain string comparison is enough.
# .copy() detaches the result from allNFL so the column assignment below
# works on an independent frame rather than on a slice of the original.
nfl = (
    allNFL.loc[allNFL["date"] >= "2017-09-07"]
    .drop(columns=["neutral", "importance", "total_rating"])
    .copy()
)

# A missing playoff value denotes a non-playoff game; label it "n".
# Only this column is filled: a blanket fillna("n") would also write the
# string "n" into any numeric column that has a gap and silently turn that
# column into strings, breaking the quantile/describe calls further down.
nfl["playoff"] = nfl["playoff"].fillna("n")

# Numeric columns used for outlier detection and summary statistics.
cols = [
    "elo1_pre", "elo2_pre", "elo_prob1", "elo_prob2", "elo1_post", "elo2_post",
    "qbelo1_pre", "qbelo2_pre", "qb1_value_pre", "qb2_value_pre", "qb1_adj", "qb2_adj",
    "qbelo_prob1", "qbelo_prob2", "qb1_game_value", "qb2_game_value", "qb1_value_post",
    "qb2_value_post", "qbelo1_post", "qbelo2_post", "score1", "score2", "quality",
]
#outlier detection
def is_outlier(x, whisker=1.5):
    """Flag the values of a numeric Series that fall outside its IQR fences.

    Args:
        x: Numeric pandas Series (a single column when the function is
            passed to ``DataFrame.apply``).
        whisker: Multiple of the interquartile range added beyond the
            quartiles to form the fences. The default of 1.5 reproduces the
            previously hard-coded rule.

    Returns:
        Boolean Series aligned with ``x``: True where the value is below
        ``Q1 - whisker * IQR`` or above ``Q3 + whisker * IQR``.
    """
    q1, q3 = x.quantile([0.25, 0.75])
    iqr = q3 - q1
    return (x < q1 - whisker * iqr) | (x > q3 + whisker * iqr)
# Build one boolean outlier mask per numeric column, then report, column by
# column, the rows that were flagged (nothing beyond the header is printed
# for a column without outliers).
outl = nfl[cols].apply(is_outlier)
for name, mask in outl.items():
    print(f"outliers in {name}")
    flagged = nfl.loc[mask, name]
    if len(flagged) > 0:
        print(flagged)
nfl.head()
outliers in elo1_pre 15936 1227.490928 15956 1219.337610 17069 1217.456406 Name: elo1_pre, dtype: float64 outliers in elo2_pre 15926 1231.838645 15971 1210.773886 15982 1201.561463 17041 1233.393097 17053 1222.225724 Name: elo2_pre, dtype: float64 outliers in elo_prob1 outliers in elo_prob2 outliers in elo1_post 15936 1219.337610 15956 1210.773886 Name: elo1_post, dtype: float64 outliers in elo2_post 15906 1231.838645 15926 1227.490928 15971 1201.561463 15982 1200.365246 17041 1222.225724 17053 1217.456406 Name: elo2_post, dtype: float64 outliers in qbelo1_pre 15936 1241.083079 15956 1229.675102 17069 1227.834387 Name: qbelo1_pre, dtype: float64 outliers in qbelo2_pre 15926 1245.425052 15971 1220.838048 15982 1210.904201 17041 1244.823574 17053 1232.846190 Name: qbelo2_pre, dtype: float64 outliers in qb1_value_pre 16129 -6.005942 17355 -2.903320 Name: qb1_value_pre, dtype: float64 outliers in qb2_value_pre 16252 0.000000 16589 0.000000 16930 0.000000 16962 0.000000 16966 0.000000 17039 -4.891621 17272 0.000000 17358 0.000000 Name: qb2_value_pre, dtype: float64 outliers in qb1_adj 15777 -111.206263 15790 55.904372 15793 -87.361926 15813 -80.459917 15815 66.709965 ... 17357 -77.088545 17360 -80.449120 17363 -63.323206 17366 -73.127285 17375 -53.296133 Name: qb1_adj, Length: 176, dtype: float64 outliers in qb2_adj 15749 -52.805172 15755 52.558532 15758 -115.222694 15780 54.403417 15804 -131.788958 ... 
17362 -59.889697 17364 -91.423174 17368 -139.668248 17370 -93.565657 17376 -48.540538 Name: qb2_adj, Length: 171, dtype: float64 outliers in qbelo_prob1 outliers in qbelo_prob2 outliers in qb1_game_value 15742 -235.201505 16055 561.903586 16062 536.886815 16205 -227.955756 16293 -186.157718 16319 542.956722 16345 628.580483 16370 556.976361 16379 -203.377584 16468 568.133430 16567 529.377014 16634 -210.655138 16671 552.757798 16889 567.489608 17038 541.279023 17049 -207.503337 17083 615.592584 17091 622.042187 17112 637.077418 17334 -241.574446 Name: qb1_game_value, dtype: float64 outliers in qb2_game_value 16010 563.905841 16028 527.794655 16045 568.784401 16060 -226.758577 16341 -314.069846 16390 -218.361351 16588 600.417397 16844 -293.851943 16859 -219.034447 Name: qb2_game_value, dtype: float64 outliers in qb1_value_post 16129 -2.90332 16713 -13.56946 Name: qb1_value_post, dtype: float64 outliers in qb2_value_post 15749 -1.495673 16015 -6.005942 16341 -21.645923 16966 -4.891621 17272 -6.109637 17358 -9.074079 17368 0.806843 Name: qb2_value_post, dtype: float64 outliers in qbelo1_post 15936 1229.675102 15956 1220.838048 Name: qbelo1_post, dtype: float64 outliers in qbelo2_post 15926 1241.083079 15971 1210.904201 15982 1208.150518 17041 1232.846190 17053 1227.834387 Name: qbelo2_post, dtype: float64 outliers in score1 15790 57 15818 52 15864 51 15895 54 16141 52 16167 54 16345 53 16390 51 16536 51 16765 52 16783 56 16909 54 17048 56 17053 50 17063 51 17288 54 17332 51 Name: score1, dtype: int64 outliers in score2 15866 51 16142 51 16167 51 16280 59 16331 55 16410 49 16597 49 17067 51 Name: score2, dtype: int64 outliers in quality
date | season | playoff | team1 | team2 | elo1_pre | elo2_pre | elo_prob1 | elo_prob2 | elo1_post | ... | qbelo_prob2 | qb1_game_value | qb2_game_value | qb1_value_post | qb2_value_post | qbelo1_post | qbelo2_post | score1 | score2 | quality | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
15740 | 2017-09-07 | 2017 | n | NE | KC | 1687.395154 | 1613.148952 | 0.690309 | 0.309691 | 1646.529757 | ... | 0.253030 | 39.100603 | 400.020698 | 229.550400 | 195.440810 | 1626.616848 | 1605.309919 | 27 | 42 | 95.0 |
15741 | 2017-09-10 | 2017 | n | CLE | PIT | 1335.767660 | 1598.852911 | 0.242271 | 0.757729 | 1329.605337 | ... | 0.764130 | 52.193001 | 169.429875 | 86.018744 | 182.850392 | 1340.915881 | 1603.053780 | 18 | 21 | 27.0 |
15742 | 2017-09-10 | 2017 | n | CIN | BAL | 1515.969638 | 1491.099567 | 0.626524 | 0.373476 | 1476.195532 | ... | 0.412898 | -235.201505 | 43.763990 | 123.092565 | 151.492584 | 1479.743800 | 1548.464635 | 0 | 20 | 56.0 |
15743 | 2017-09-10 | 2017 | n | BUF | NYJ | 1484.127683 | 1451.565526 | 0.636826 | 0.363174 | 1500.142289 | ... | 0.289442 | 161.895534 | 90.819847 | 163.301767 | 127.907467 | 1464.768579 | 1348.394459 | 21 | 12 | 10.0 |
15744 | 2017-09-10 | 2017 | n | HOU | JAX | 1502.139008 | 1381.984201 | 0.743804 | 0.256196 | 1451.208768 | ... | 0.327975 | -115.102578 | 142.320362 | 27.442070 | 150.016385 | 1469.052384 | 1470.615187 | 7 | 29 | 28.0 |
5 rows × 30 columns
3. Summary Data Analysis
.
Statistical summaries of every numerical column, presented as box plots and as numerical summaries.
# The numeric columns are split into two box-plot groups so that every column
# is drawn exactly once. Previously two columns appeared in both groups, two
# were never plotted, and QB values shared an axis with Elo ratings.

# Group 1: team and QB-adjusted Elo ratings (all on the Elo rating scale).
col1 = [
    "elo1_pre", "elo2_pre", "elo1_post", "elo2_post",
    "qbelo1_pre", "qbelo2_pre", "qbelo1_post", "qbelo2_post",
]

# Group 2: every remaining numeric column (win probabilities, QB values and
# adjustments, scores, game quality).
col2 = [
    "elo_prob1", "elo_prob2", "qbelo_prob1", "qbelo_prob2",
    "qb1_value_pre", "qb2_value_pre", "qb1_adj", "qb2_adj",
    "qb1_game_value", "qb2_game_value", "qb1_value_post", "qb2_value_post",
    "score1", "score2", "quality",
]

sns.catplot(data=nfl[col1], kind="box")
sns.catplot(data=nfl[col2], kind="box")

# Numerical summary (count, mean, std, quartiles, min/max) of each column.
for col in cols:
    print(col)
    print(nfl[col].describe())
elo1_pre count 1639.000000 mean 1512.499466 std 100.225540 min 1217.456406 25% 1444.568312 50% 1513.175041 75% 1582.497828 max 1777.933310 Name: elo1_pre, dtype: float64 elo2_pre count 1639.000000 mean 1507.805605 std 97.034370 min 1201.561463 25% 1442.571702 50% 1508.127681 75% 1579.908910 max 1761.858581 Name: elo2_pre, dtype: float64 elo_prob1 count 1639.000000 mean 0.586480 std 0.169136 min 0.123942 25% 0.467316 50% 0.603330 75% 0.710388 max 0.953944 Name: elo_prob1, dtype: float64 elo_prob2 count 1639.000000 mean 0.413520 std 0.169136 min 0.046056 25% 0.289612 50% 0.396670 75% 0.532684 max 0.876058 Name: elo_prob2, dtype: float64 elo1_post count 1639.000000 mean 1510.467249 std 101.999854 min 1210.773886 25% 1441.360522 50% 1510.990513 75% 1583.654653 max 1777.933310 Name: elo1_post, dtype: float64 elo2_post count 1639.000000 mean 1509.837821 std 99.429424 min 1200.365246 25% 1442.314091 50% 1509.539094 75% 1582.412029 max 1775.119369 Name: elo2_post, dtype: float64 qbelo1_pre count 1639.000000 mean 1510.173885 std 95.508050 min 1227.834387 25% 1444.237378 50% 1513.257169 75% 1577.819690 max 1757.263199 Name: qbelo1_pre, dtype: float64 qbelo2_pre count 1639.000000 mean 1506.507542 std 92.303346 min 1210.904201 25% 1442.976575 50% 1506.610858 75% 1573.316037 max 1742.902172 Name: qbelo2_pre, dtype: float64 qb1_value_pre count 1639.000000 mean 158.897306 std 59.232412 min -6.005942 25% 120.028441 50% 158.196359 75% 201.699522 max 313.828383 Name: qb1_value_pre, dtype: float64 qb2_value_pre count 1639.000000 mean 157.764127 std 58.490109 min -4.891621 25% 120.409097 50% 155.669948 75% 198.870798 max 310.130678 Name: qb2_value_pre, dtype: float64 qb1_adj count 1639.000000 mean -5.703507 std 35.525193 min -242.487678 25% -10.871662 50% 2.008961 75% 13.295850 max 71.795946 Name: qb1_adj, dtype: float64 qb2_adj count 1639.000000 mean -5.671711 std 35.999316 min -235.050690 25% -10.257874 50% 1.979643 75% 13.551762 max 69.108875 Name: qb2_adj, dtype: float64 
qbelo_prob1 count 1639.000000 mean 0.572435 std 0.181273 min 0.070229 25% 0.444650 50% 0.585947 75% 0.711116 max 0.964496 Name: qbelo_prob1, dtype: float64 qbelo_prob2 count 1639.000000 mean 0.427565 std 0.181273 min 0.035504 25% 0.288884 50% 0.414053 75% 0.555350 max 0.929771 Name: qbelo_prob2, dtype: float64 qb1_game_value count 1639.000000 mean 168.777503 std 136.350794 min -241.574446 25% 80.799033 50% 166.424500 75% 255.162020 max 637.077418 Name: qb1_game_value, dtype: float64 qb2_game_value count 1639.000000 mean 149.472068 std 137.020214 min -314.069846 25% 57.393301 50% 151.832530 75% 239.763865 max 600.417397 Name: qb2_game_value, dtype: float64 qb1_value_post count 1639.000000 mean 159.885326 std 59.151489 min -13.569460 25% 120.333154 50% 158.521746 75% 202.390239 max 310.130678 Name: qb1_value_post, dtype: float64 qb2_value_post count 1639.000000 mean 156.934922 std 58.903806 min -21.645923 25% 119.773154 50% 155.751957 75% 197.520046 max 313.828383 Name: qb2_value_post, dtype: float64 qbelo1_post count 1639.000000 mean 1508.866310 std 97.279330 min 1220.838048 25% 1441.094966 50% 1510.827405 75% 1579.186263 max 1757.263199 Name: qbelo1_post, dtype: float64 qbelo2_post count 1639.000000 mean 1507.815117 std 95.037135 min 1208.150518 25% 1443.262089 50% 1508.323190 75% 1576.703624 max 1754.511219 Name: qbelo2_post, dtype: float64 score1 count 1639.000000 mean 23.700427 std 10.134743 min 0.000000 25% 17.000000 50% 24.000000 75% 30.000000 max 57.000000 Name: score1, dtype: float64 score2 count 1639.000000 mean 22.198292 std 9.967792 min 0.000000 25% 16.000000 50% 22.000000 75% 29.000000 max 59.000000 Name: score2, dtype: float64 quality count 1639.000000 mean 48.269067 std 29.395812 min 0.000000 25% 22.000000 50% 48.000000 75% 74.000000 max 100.000000 Name: quality, dtype: float64
Correlation between the pre- and post-game Elo ratings of both teams, and between the pre- and post-game Elo ratings of both quarterbacks. I found that the team Elo ratings and the quarterback Elo ratings of both teams before and after the game were strongly linked.
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns
# Pairwise correlation of the pre-/post-game team Elo and QB Elo ratings,
# drawn as an annotated heatmap on a fixed [-1, 1] colour scale.
rating_columns = [
    "elo1_pre", "elo1_post",
    "qbelo2_pre", "qbelo2_post",
    "qbelo1_pre", "qbelo1_post",
    "elo2_pre", "elo2_post",
]
colsC = nfl[rating_columns]
C = colsC.corr()
sns.heatmap(data=C, vmin=-1, vmax=1, cmap='coolwarm', annot=True)
<Axes: >
4. Discussion
Questions :-
- Can the historical Elo ratings of NFL teams, combined with quarterback performance metrics (such as QB Elo ratings before, during and after the game) and game context (such as season, playoff status, and venue), predict whether a game will result in an upset (i.e., the team with the lower Elo rating wins)?
- Can the combined Elo ratings of NFL teams before a game, along with quarterback performance metrics and game context (such as season, playoff status, and neutral venue), predict the total points scored in that game?