In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

import numpy as np
import pandas as pd
import seaborn as sns

1. Introduction¶

nfl_elo.csv contains game-by-game Elo ratings (Elo is a closed system where every point gained by one team is a point lost by another) and forecasts back to 1920.

In [2]:
# Read the complete game-by-game Elo table and preview its first five rows.
allNFL = pd.read_csv(filepath_or_buffer="nfl_elo.csv")
allNFL.head(n=5)
Out[2]:
date season neutral playoff team1 team2 elo1_pre elo2_pre elo_prob1 elo_prob2 ... qb2_game_value qb1_value_post qb2_value_post qbelo1_post qbelo2_post score1 score2 quality importance total_rating
0 1920-09-26 1920 0 NaN RII STP 1503.947 1300.000 0.824651 0.175349 ... NaN NaN NaN NaN NaN 48 0 NaN NaN NaN
1 1920-10-03 1920 0 NaN AKR WHE 1503.420 1300.000 0.824212 0.175788 ... NaN NaN NaN NaN NaN 43 0 NaN NaN NaN
2 1920-10-03 1920 0 NaN BFF WBU 1478.004 1300.000 0.802000 0.198000 ... NaN NaN NaN NaN NaN 32 6 NaN NaN NaN
3 1920-10-03 1920 0 NaN DAY COL 1493.002 1504.908 0.575819 0.424181 ... NaN NaN NaN NaN NaN 14 0 NaN NaN NaN
4 1920-10-03 1920 0 NaN RII MUN 1516.108 1478.004 0.644171 0.355829 ... NaN NaN NaN NaN NaN 45 0 NaN NaN NaN

5 rows × 33 columns

2. Preprocessing¶

Since we want to make accurate predictions for future games, we use only data starting from the 2017 season. We also remove the neutral, importance and total_rating columns, since more than half of their values are missing.

In [3]:
# `numpy.NaN` was removed in NumPy 2.0, so `from numpy import NaN` raises ImportError
# there. Import the lowercase spelling under the old alias to keep the name available.
from numpy import nan as NaN


# Keep games dated 2017-09-07 or later. The `date` column holds ISO-formatted strings
# (YYYY-MM-DD), so a plain string comparison orders them chronologically.
nfl = (
    allNFL.loc[allNFL["date"] >= "2017-09-07"]
    # Columns the analysis does not use.
    .drop(columns=["neutral", "importance", "total_rating"])
    # Replace the NaN's in the playoff column with "n", which represents a non-playoff
    # game. Only that column is filled: a frame-wide fillna("n") would also write the
    # string "n" into any numeric column containing a NaN, turning it into an object
    # column and breaking the quantile-based outlier check.
    .assign(playoff=lambda games: games["playoff"].fillna("n"))
)

# Numeric columns that are screened for outliers and summarised.
cols = ["elo1_pre","elo2_pre","elo_prob1","elo_prob2","elo1_post","elo2_post",
        "qbelo1_pre","qbelo2_pre","qb1_value_pre","qb2_value_pre","qb1_adj","qb2_adj",
        "qbelo_prob1","qbelo_prob2","qb1_game_value","qb2_game_value","qb1_value_post",
        "qb2_value_post","qbelo1_post","qbelo2_post","score1","score2","quality"]
# Outlier detection
def is_outlier(x, whisker=1.5):
    """Flag the values of ``x`` that fall outside its interquartile-range fences.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to screen.
    whisker : float, default 1.5
        Multiple of the interquartile range added above the 75th percentile and
        subtracted below the 25th percentile to form the fences. The default
        reproduces the previously hard-coded factor.

    Returns
    -------
    pandas.Series
        Boolean Series aligned with ``x``; True where the value is strictly below
        the lower fence or strictly above the upper fence.
    """
    q25, q75 = x.quantile([.25, .75])
    iqr = q75 - q25
    lower_fence = q25 - whisker * iqr
    upper_fence = q75 + whisker * iqr
    return (x < lower_fence) | (x > upper_fence)

# One boolean mask per numeric column, then list the flagged values column by column.
outl = nfl[cols].apply(is_outlier)
for col, flagged in outl.items():
    out = nfl.loc[flagged, col]
    # The header line is printed for every column, even when nothing was flagged.
    print(f"outliers in {col}")
    if len(out) > 0:
        print(out)

nfl.head()
outliers in elo1_pre
15936    1227.490928
15956    1219.337610
17069    1217.456406
Name: elo1_pre, dtype: float64
outliers in elo2_pre
15926    1231.838645
15971    1210.773886
15982    1201.561463
17041    1233.393097
17053    1222.225724
Name: elo2_pre, dtype: float64
outliers in elo_prob1
outliers in elo_prob2
outliers in elo1_post
15936    1219.337610
15956    1210.773886
Name: elo1_post, dtype: float64
outliers in elo2_post
15906    1231.838645
15926    1227.490928
15971    1201.561463
15982    1200.365246
17041    1222.225724
17053    1217.456406
Name: elo2_post, dtype: float64
outliers in qbelo1_pre
15936    1241.083079
15956    1229.675102
17069    1227.834387
Name: qbelo1_pre, dtype: float64
outliers in qbelo2_pre
15926    1245.425052
15971    1220.838048
15982    1210.904201
17041    1244.823574
17053    1232.846190
Name: qbelo2_pre, dtype: float64
outliers in qb1_value_pre
16129   -6.005942
17355   -2.903320
Name: qb1_value_pre, dtype: float64
outliers in qb2_value_pre
16252    0.000000
16589    0.000000
16930    0.000000
16962    0.000000
16966    0.000000
17039   -4.891621
17272    0.000000
17358    0.000000
Name: qb2_value_pre, dtype: float64
outliers in qb1_adj
15777   -111.206263
15790     55.904372
15793    -87.361926
15813    -80.459917
15815     66.709965
            ...    
17357    -77.088545
17360    -80.449120
17363    -63.323206
17366    -73.127285
17375    -53.296133
Name: qb1_adj, Length: 176, dtype: float64
outliers in qb2_adj
15749    -52.805172
15755     52.558532
15758   -115.222694
15780     54.403417
15804   -131.788958
            ...    
17362    -59.889697
17364    -91.423174
17368   -139.668248
17370    -93.565657
17376    -48.540538
Name: qb2_adj, Length: 171, dtype: float64
outliers in qbelo_prob1
outliers in qbelo_prob2
outliers in qb1_game_value
15742   -235.201505
16055    561.903586
16062    536.886815
16205   -227.955756
16293   -186.157718
16319    542.956722
16345    628.580483
16370    556.976361
16379   -203.377584
16468    568.133430
16567    529.377014
16634   -210.655138
16671    552.757798
16889    567.489608
17038    541.279023
17049   -207.503337
17083    615.592584
17091    622.042187
17112    637.077418
17334   -241.574446
Name: qb1_game_value, dtype: float64
outliers in qb2_game_value
16010    563.905841
16028    527.794655
16045    568.784401
16060   -226.758577
16341   -314.069846
16390   -218.361351
16588    600.417397
16844   -293.851943
16859   -219.034447
Name: qb2_game_value, dtype: float64
outliers in qb1_value_post
16129    -2.90332
16713   -13.56946
Name: qb1_value_post, dtype: float64
outliers in qb2_value_post
15749    -1.495673
16015    -6.005942
16341   -21.645923
16966    -4.891621
17272    -6.109637
17358    -9.074079
17368     0.806843
Name: qb2_value_post, dtype: float64
outliers in qbelo1_post
15936    1229.675102
15956    1220.838048
Name: qbelo1_post, dtype: float64
outliers in qbelo2_post
15926    1241.083079
15971    1210.904201
15982    1208.150518
17041    1232.846190
17053    1227.834387
Name: qbelo2_post, dtype: float64
outliers in score1
15790    57
15818    52
15864    51
15895    54
16141    52
16167    54
16345    53
16390    51
16536    51
16765    52
16783    56
16909    54
17048    56
17053    50
17063    51
17288    54
17332    51
Name: score1, dtype: int64
outliers in score2
15866    51
16142    51
16167    51
16280    59
16331    55
16410    49
16597    49
17067    51
Name: score2, dtype: int64
outliers in quality
Out[3]:
date season playoff team1 team2 elo1_pre elo2_pre elo_prob1 elo_prob2 elo1_post ... qbelo_prob2 qb1_game_value qb2_game_value qb1_value_post qb2_value_post qbelo1_post qbelo2_post score1 score2 quality
15740 2017-09-07 2017 n NE KC 1687.395154 1613.148952 0.690309 0.309691 1646.529757 ... 0.253030 39.100603 400.020698 229.550400 195.440810 1626.616848 1605.309919 27 42 95.0
15741 2017-09-10 2017 n CLE PIT 1335.767660 1598.852911 0.242271 0.757729 1329.605337 ... 0.764130 52.193001 169.429875 86.018744 182.850392 1340.915881 1603.053780 18 21 27.0
15742 2017-09-10 2017 n CIN BAL 1515.969638 1491.099567 0.626524 0.373476 1476.195532 ... 0.412898 -235.201505 43.763990 123.092565 151.492584 1479.743800 1548.464635 0 20 56.0
15743 2017-09-10 2017 n BUF NYJ 1484.127683 1451.565526 0.636826 0.363174 1500.142289 ... 0.289442 161.895534 90.819847 163.301767 127.907467 1464.768579 1348.394459 21 12 10.0
15744 2017-09-10 2017 n HOU JAX 1502.139008 1381.984201 0.743804 0.256196 1451.208768 ... 0.327975 -115.102578 142.320362 27.442070 150.016385 1469.052384 1470.615187 7 29 28.0

5 rows × 30 columns

3. Summary Data Analysis¶

.

Statistical summaries of every numerical column, shown as box plots and as numerical summaries.

In [4]:
# The box plots are drawn in two groups so that every numeric column in `cols` is
# plotted exactly once. (Previously `elo_prob1` and `qbelo2_post` were in neither
# group, while `qb2_value_post` and `qbelo1_post` were in both.)
# Group 1: the team and quarterback Elo rating columns.
col1 = ["elo1_pre","elo2_pre","elo1_post","elo2_post",
        "qbelo1_pre","qbelo2_pre","qbelo1_post","qbelo2_post"]
# Group 2: every remaining numeric column, derived from `cols` so nothing can be missed.
col2 = [col for col in cols if col not in col1]

sns.catplot(data=nfl[col1],
     kind="box"
    )
sns.catplot(data=nfl[col2],
     kind="box"
    )

# Numerical summary (count, mean, std, quartiles, min/max) for each numeric column.
for col in cols:
    print(col)
    print(nfl[col].describe())
elo1_pre
count    1639.000000
mean     1512.499466
std       100.225540
min      1217.456406
25%      1444.568312
50%      1513.175041
75%      1582.497828
max      1777.933310
Name: elo1_pre, dtype: float64
elo2_pre
count    1639.000000
mean     1507.805605
std        97.034370
min      1201.561463
25%      1442.571702
50%      1508.127681
75%      1579.908910
max      1761.858581
Name: elo2_pre, dtype: float64
elo_prob1
count    1639.000000
mean        0.586480
std         0.169136
min         0.123942
25%         0.467316
50%         0.603330
75%         0.710388
max         0.953944
Name: elo_prob1, dtype: float64
elo_prob2
count    1639.000000
mean        0.413520
std         0.169136
min         0.046056
25%         0.289612
50%         0.396670
75%         0.532684
max         0.876058
Name: elo_prob2, dtype: float64
elo1_post
count    1639.000000
mean     1510.467249
std       101.999854
min      1210.773886
25%      1441.360522
50%      1510.990513
75%      1583.654653
max      1777.933310
Name: elo1_post, dtype: float64
elo2_post
count    1639.000000
mean     1509.837821
std        99.429424
min      1200.365246
25%      1442.314091
50%      1509.539094
75%      1582.412029
max      1775.119369
Name: elo2_post, dtype: float64
qbelo1_pre
count    1639.000000
mean     1510.173885
std        95.508050
min      1227.834387
25%      1444.237378
50%      1513.257169
75%      1577.819690
max      1757.263199
Name: qbelo1_pre, dtype: float64
qbelo2_pre
count    1639.000000
mean     1506.507542
std        92.303346
min      1210.904201
25%      1442.976575
50%      1506.610858
75%      1573.316037
max      1742.902172
Name: qbelo2_pre, dtype: float64
qb1_value_pre
count    1639.000000
mean      158.897306
std        59.232412
min        -6.005942
25%       120.028441
50%       158.196359
75%       201.699522
max       313.828383
Name: qb1_value_pre, dtype: float64
qb2_value_pre
count    1639.000000
mean      157.764127
std        58.490109
min        -4.891621
25%       120.409097
50%       155.669948
75%       198.870798
max       310.130678
Name: qb2_value_pre, dtype: float64
qb1_adj
count    1639.000000
mean       -5.703507
std        35.525193
min      -242.487678
25%       -10.871662
50%         2.008961
75%        13.295850
max        71.795946
Name: qb1_adj, dtype: float64
qb2_adj
count    1639.000000
mean       -5.671711
std        35.999316
min      -235.050690
25%       -10.257874
50%         1.979643
75%        13.551762
max        69.108875
Name: qb2_adj, dtype: float64
qbelo_prob1
count    1639.000000
mean        0.572435
std         0.181273
min         0.070229
25%         0.444650
50%         0.585947
75%         0.711116
max         0.964496
Name: qbelo_prob1, dtype: float64
qbelo_prob2
count    1639.000000
mean        0.427565
std         0.181273
min         0.035504
25%         0.288884
50%         0.414053
75%         0.555350
max         0.929771
Name: qbelo_prob2, dtype: float64
qb1_game_value
count    1639.000000
mean      168.777503
std       136.350794
min      -241.574446
25%        80.799033
50%       166.424500
75%       255.162020
max       637.077418
Name: qb1_game_value, dtype: float64
qb2_game_value
count    1639.000000
mean      149.472068
std       137.020214
min      -314.069846
25%        57.393301
50%       151.832530
75%       239.763865
max       600.417397
Name: qb2_game_value, dtype: float64
qb1_value_post
count    1639.000000
mean      159.885326
std        59.151489
min       -13.569460
25%       120.333154
50%       158.521746
75%       202.390239
max       310.130678
Name: qb1_value_post, dtype: float64
qb2_value_post
count    1639.000000
mean      156.934922
std        58.903806
min       -21.645923
25%       119.773154
50%       155.751957
75%       197.520046
max       313.828383
Name: qb2_value_post, dtype: float64
qbelo1_post
count    1639.000000
mean     1508.866310
std        97.279330
min      1220.838048
25%      1441.094966
50%      1510.827405
75%      1579.186263
max      1757.263199
Name: qbelo1_post, dtype: float64
qbelo2_post
count    1639.000000
mean     1507.815117
std        95.037135
min      1208.150518
25%      1443.262089
50%      1508.323190
75%      1576.703624
max      1754.511219
Name: qbelo2_post, dtype: float64
score1
count    1639.000000
mean       23.700427
std        10.134743
min         0.000000
25%        17.000000
50%        24.000000
75%        30.000000
max        57.000000
Name: score1, dtype: float64
score2
count    1639.000000
mean       22.198292
std         9.967792
min         0.000000
25%        16.000000
50%        22.000000
75%        29.000000
max        59.000000
Name: score2, dtype: float64
quality
count    1639.000000
mean       48.269067
std        29.395812
min         0.000000
25%        22.000000
50%        48.000000
75%        74.000000
max       100.000000
Name: quality, dtype: float64
No description has been provided for this image
No description has been provided for this image

Correlation between the pre- and post-game Elo ratings of both teams, and between the pre- and post-game quarterback Elo ratings of both teams. I found that the team Elo ratings and the quarterback Elo ratings of both teams, before and after the game, were strongly linked.

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns

# Pairwise correlations between the pre- and post-game team / quarterback Elo columns,
# drawn as an annotated heatmap on a fixed -1..1 colour scale.
colsC = nfl.loc[:, [
    "elo1_pre", "elo1_post",
    "qbelo2_pre", "qbelo2_post",
    "qbelo1_pre", "qbelo1_post",
    "elo2_pre", "elo2_post",
]]
C = colsC.corr()
sns.heatmap(data=C, vmin=-1, vmax=1, cmap='coolwarm', annot=True)
Out[5]:
<Axes: >
No description has been provided for this image

4. Discussion¶

Questions :-

  1. Can the historical Elo ratings of NFL teams, combined with quarterback performance metrics (such as QB Elo ratings before, during and after the game) and game context (such as season, playoff status, and venue), predict whether a game will result in an upset (i.e., the team with the lower Elo rating wins)?
  2. Can the combined Elo ratings of NFL teams before a game, along with quarterback performance metrics and game context (such as season, playoff status, and neutral venue), predict the total points scored in that game?